comparison PsiCLASS-1.0.2/alignments.hpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 // The class handles reading the bam file
2
3 #ifndef _LSONG_RSCAF_ALIGNMENT_HEADER
4 #define _LSONG_RSCAF_ALIGNMENT_HEADER
5
6 #include "samtools-0.1.19/sam.h"
7 #include <map>
8 #include <string>
9 #include <assert.h>
10 #include <iostream>
11 #include <stdlib.h>
12 #include <math.h>
13
14 #include "defs.h"
15
16 class Alignments
17 {
18 private:
19 samfile_t *fpSam ;
20 bam1_t *b ;
21
22 char fileName[1024] ;
23 bool opened ;
24 std::map<std::string, int> chrNameToId ;
25 bool allowSupplementary ;
26 bool allowClip ;
27 bool hasClipHead, hasClipTail ;
28 int segmentsSum ; // the sum of segments.
29
30 bool atBegin ;
31 bool atEnd ;
32
33 static int CompInt( const void *p1, const void *p2 )
34 {
35 return (*(int *)p1 ) - (*(int *)p2 ) ;
36 }
37
38 void Open()
39 {
40 fpSam = samopen( fileName, "rb", 0 ) ;
41 if ( !fpSam->header )
42 {
43 fprintf( stderr, "Can not open %s.\n", fileName ) ;
44 exit( 1 ) ;
45 }
46
47 // Collect the chromosome information
48 for ( int i = 0 ; i < fpSam->header->n_targets ; ++i )
49 {
50 std::string s( fpSam->header->target_name[i] ) ;
51 chrNameToId[s] = i ;
52 }
53 opened = true ;
54 atBegin = true ;
55 atEnd = false ;
56 }
57 public:
58 struct _pair segments[MAX_SEG_COUNT] ;
59 int segCnt ;
60
61 int totalReadCnt ;
62 int fragLen, fragStdev ;
63 int readLen ;
64 bool matePaired ;
65
66 Alignments()
67 {
68 b = NULL ;
69 opened = false ;
70 atBegin = true ;
71 atEnd = false ;
72 allowSupplementary = false ;
73 allowClip = true ;
74
75 totalReadCnt = 0 ;
76 fragLen = 0 ;
77 fragStdev = 0 ;
78 readLen = 0 ;
79 matePaired = false ;
80 }
81 ~Alignments()
82 {
83 if ( b )
84 bam_destroy1( b ) ;
85 }
86
87 void Open( char *file )
88 {
89 strcpy( fileName, file ) ;
90 Open() ;
91 }
92
93 void Rewind()
94 {
95 Close() ;
96 Open() ;
97
98 atBegin = true ;
99 atEnd = false ;
100 }
101
102 void Close()
103 {
104 samclose( fpSam ) ;
105 fpSam = NULL ;
106 }
107
108 bool IsOpened()
109 {
110 return opened ;
111 }
112
113 bool IsAtBegin()
114 {
115 return atBegin ;
116 }
117
118 bool IsAtEnd()
119 {
120 return atEnd ;
121 }
122
123 bool HasClip()
124 {
125 return hasClipHead || hasClipTail ;
126 }
127
128 bool HasClipHead()
129 {
130 return hasClipHead ;
131 }
132 bool HasClipTail()
133 {
134 return hasClipTail ;
135 }
136
137 int Next()
138 {
139 int i ;
140 int start = 0, len = 0 ;
141 uint32_t *rawCigar ;
142
143 if ( atBegin == true )
144 totalReadCnt = 0 ;
145
146 atBegin = false ;
147 while ( 1 )
148 {
149 while ( 1 )
150 {
151 if ( b )
152 bam_destroy1( b ) ;
153 b = bam_init1() ;
154
155 if ( samread( fpSam, b ) <= 0 )
156 {
157 atEnd = true ;
158 return 0 ;
159 }
160
161 if ( ( b->core.flag & 0x900 ) == 0 )
162 ++totalReadCnt ;
163
164 if ( b->core.flag & 0xC )
165 continue ;
166
167 //if ( ( b->core.flag & 0x900 ) == 0 )
168 break ;
169 }
170
171 // to many repeat.
172 if ( bam_aux_get( b, "NH" ) )
173 {
174 if ( bam_aux2i( bam_aux_get( b, "NH" ) ) >= 5 )
175 continue ;
176 }
177
178 // Compute the exons segments from the reads
179 segCnt = 0 ;
180 start = b->core.pos ; //+ 1 ;
181 rawCigar = bam1_cigar( b ) ;
182 // Check whether the query length is compatible with the read
183 if ( bam_cigar2qlen( &b->core, rawCigar ) != b->core.l_qseq )
184 continue ;
185
186 bool clipMiddle = false ;
187 int clipSum = 0 ;
188 hasClipHead = hasClipTail = false ;
189 len = 0 ;
190 segmentsSum = 0 ;
191 for ( i = 0 ; i < b->core.n_cigar ; ++i )
192 {
193 int op = rawCigar[i] & BAM_CIGAR_MASK ;
194 int num = rawCigar[i] >> BAM_CIGAR_SHIFT ;
195
196 switch ( op )
197 {
198 case BAM_CMATCH:
199 case BAM_CDEL:
200 len += num ; break ;
201 case BAM_CSOFT_CLIP:
202 case BAM_CHARD_CLIP:
203 case BAM_CPAD:
204 {
205 if ( i == 0 )
206 hasClipHead = true ;
207 else if ( i == b->core.n_cigar - 1 )
208 hasClipTail = true ;
209 else
210 clipMiddle = true ;
211
212 clipSum += num ;
213 }
214 case BAM_CINS:
215 num = 0 ; break ;
216 case BAM_CREF_SKIP:
217 {
218 segments[ segCnt ].a = start ;
219 segments[ segCnt ].b = start + len - 1 ;
220 ++segCnt ;
221 segmentsSum += len ;
222 start = start + len + num ;
223 len = 0 ;
224 } break ;
225 default:
226 len += num ; break ;
227 }
228 }
229 if ( clipMiddle ) // should never happend
230 continue ;
231
232 if ( clipSum >= 2 && !allowClip )
233 continue ;
234
235 if ( len > 0 )
236 {
237 segments[ segCnt ].a = start ;
238 segments[ segCnt ].b = start + len - 1 ;
239 ++segCnt ;
240 segmentsSum += len ;
241 }
242 /*if ( !strcmp( bam1_qname( b ), "chr1:109656301-109749401W:ENST00000490758.2:381:1480:1090:1290:X" ) )
243 {
244 for ( i = 0 ; i < segCnt ; ++i )
245 printf( "(%d %d) ", segments[i].a, segments[i].b ) ;
246 printf( "\n" ) ;
247 }*/
248
249 // Check whether the mates are compatible
250 //int mChrId = b->core.mtid ;
251 int64_t mPos = b->core.mpos ;
252
253 if ( b->core.mtid == b->core.tid )
254 {
255 for ( i = 0 ; i < segCnt - 1 ; ++i )
256 {
257 if ( mPos >= segments[i].b && mPos <= segments[i + 1].a )
258 break ;
259 }
260 if ( i < segCnt - 1 )
261 continue ;
262 }
263
264 break ;
265 }
266
267 return 1 ;
268 }
269
270
271 int GetChromId()
272 {
273 return b->core.tid ;
274 }
275
276 char* GetChromName( int tid )
277 {
278 return fpSam->header->target_name[ tid ] ;
279 }
280
281 int GetChromIdFromName( const char *s )
282 {
283 std::string ss( s ) ;
284 if ( chrNameToId.find( ss ) == chrNameToId.end() )
285 {
286 printf( "Unknown genome name: %s\n", s ) ;
287 exit( 1 ) ;
288 }
289 return chrNameToId[ss] ;
290 }
291
292 int GetChromLength( int tid )
293 {
294 return fpSam->header->target_len[ tid ] ;
295 }
296
297 int GetChromCount()
298 {
299 return fpSam->header->n_targets ;
300 }
301
302 void GetMatePosition( int &chrId, int64_t &pos )
303 {
304 if ( b->core.flag & 0x8 )
305 {
306 chrId = -1 ;
307 pos = -1 ;
308 }
309 else
310 {
311 chrId = b->core.mtid ;
312 pos = b->core.mpos ; //+ 1 ;
313 }
314 }
315
316 int GetRepeatPosition( int &chrId, int64_t &pos )
317 {
318 // Look at the CC field.
319 if ( !bam_aux_get( b, "CC" ) || !bam_aux_get( b, "CP" ) )
320 {
321 chrId = -1 ;
322 pos = -1 ;
323 return 0 ;
324 }
325
326 std::string s( bam_aux2Z( bam_aux_get(b, "CC" ) ) ) ;
327 chrId = chrNameToId[ s ] ;
328 pos = bam_aux2i( bam_aux_get( b, "CP" ) ) ;// Possible error for 64bit
329 return 1 ;
330 }
331
332 void GetFileName( char *buffer )
333 {
334 strcpy( buffer, fileName ) ;
335 }
336
337 int GetReadLength()
338 {
339 return b->core.l_qseq ;
340 }
341
342 int GetRefCoverLength()
343 {
344 return segmentsSum ;
345 }
346
347 bool IsFirstMate()
348 {
349 if ( b->core.flag & 0x40 )
350 return true ;
351 return false ;
352 }
353
354 bool IsReverse()
355 {
356 if ( b->core.flag & 0x10 )
357 return true ;
358 return false ;
359 }
360
361 bool IsMateReverse()
362 {
363 if ( b->core.flag & 0x20 )
364 return true ;
365 return false ;
366 }
367
368 char *GetReadId()
369 {
370 return bam1_qname( b ) ;
371 }
372
373 bool IsUnique()
374 {
375 if ( bam_aux_get( b, "NH" ) )
376 {
377 if ( bam_aux2i( bam_aux_get( b, "NH" ) ) > 1 )
378 return false ;
379 }
380 if ( IsSupplementary() && GetFieldZ( (char *)"XZ" ) != NULL )
381 return false ;
382 return true ;
383 }
384
385 bool IsPrimary()
386 {
387 if ( ( b->core.flag & 0x900 ) == 0 )
388 return true ;
389 else
390 return false ;
391 }
392
393 // -1:minus, 0: unknown, 1:plus
394 int GetStrand()
395 {
396 if ( segCnt == 1 )
397 return 0 ;
398 if ( bam_aux_get( b, "XS" ) )
399 {
400 if ( bam_aux2A( bam_aux_get( b, "XS" ) ) == '-' )
401 return -1 ;
402 else
403 return 1 ;
404 }
405 else
406 return 0 ;
407 }
408
409 int GetFieldI( char *f )
410 {
411 if ( bam_aux_get( b, f ) )
412 {
413 return bam_aux2i( bam_aux_get( b, f ) ) ;
414 }
415 return -1 ;
416 }
417
418 char *GetFieldZ( char *f )
419 {
420 if ( bam_aux_get( b, f ) )
421 {
422 return bam_aux2Z( bam_aux_get( b, f ) ) ;
423 }
424 return NULL ;
425 }
426
427 int GetNumberOfHits()
428 {
429 if ( bam_aux_get( b, "NH" ) )
430 {
431 return bam_aux2i( bam_aux_get( b, "NH" ) ) ;
432 }
433 return 1 ;
434 }
435
436 bool IsSupplementary()
437 {
438 if ( ( b->core.flag & 0x800 ) == 0 )
439 return false ;
440 else
441 return true ;
442 }
443
444 void SetAllowSupplementary( bool in )
445 {
446 allowSupplementary = in ;
447 }
448
449 void SetAllowClip( bool in )
450 {
451 allowClip = in ;
452 }
453
454 bool IsGCRich( bool threshold = 0.9 )
455 {
456 int i = 0 ;
457 int gcCnt = 0 ;
458 for ( i = 0 ; i < b->core.l_qseq ; ++i )
459 {
460 int bit = bam1_seqi( bam1_seq( b ), i ) ;
461 if ( bit == 2 || bit == 4 )
462 ++gcCnt ;
463 }
464 if ( gcCnt >= threshold * b->core.l_qseq )
465 return true ;
466 }
467
468 void GetGeneralInfo( bool stopEarly = false )
469 {
470 int i, k ;
471
472 const int sampleMax = 1000000 ;
473 int *lens = new int[sampleMax] ;
474 int *mateDiff = new int[sampleMax] ;
475 int lensCnt = 0 ;
476 int mateDiffCnt = 0 ;
477 bool end = false ;
478
479 while ( 1 )
480 {
481 while ( 1 )
482 {
483 if ( b )
484 bam_destroy1( b ) ;
485 b = bam_init1() ;
486
487 if ( samread( fpSam, b ) <= 0 )
488 {
489 end = true ;
490 break ;
491 }
492 if ( b->core.flag & 0xC )
493 continue ;
494
495 if ( ( b->core.flag & 0x900 ) == 0 )
496 break ;
497 }
498 if ( end )
499 break ;
500
501 if ( lensCnt < sampleMax )
502 {
503 lens[ lensCnt ] = b->core.l_qseq ;
504 ++lensCnt ;
505 }
506
507 if ( mateDiffCnt < sampleMax && b->core.tid == b->core.mtid
508 && b->core.pos < b->core.mpos )
509 {
510 mateDiff[ mateDiffCnt ] = b->core.mpos - b->core.pos ;
511 ++mateDiffCnt ;
512 }
513 ++totalReadCnt ;
514 if ( totalReadCnt >= sampleMax && stopEarly )
515 break ;
516 }
517
518 // Get the read length info and fragment length info
519 qsort( lens, lensCnt, sizeof( int ), CompInt ) ;
520 readLen = lens[ lensCnt - 1 ] ;
521
522 if ( mateDiffCnt > 0 )
523 {
524 matePaired = true ;
525
526 qsort( mateDiff, mateDiffCnt, sizeof( int ), CompInt ) ;
527 long long int sum = 0 ;
528 long long int sumsq = 0 ;
529
530 for ( i = 0 ; i < mateDiffCnt * 0.7 ; ++i )
531 {
532 sum += ( mateDiff[i] + readLen );
533 sumsq += ( mateDiff[i] + readLen ) * ( mateDiff[i] + readLen ) ;
534 }
535 k = i ;
536 fragLen = (int)( sum / k ) ;
537 fragStdev = (int)sqrt( sumsq / k - fragLen * fragLen ) ;
538 }
539 else
540 {
541 fragLen = readLen ;
542 fragStdev = 0 ;
543 }
544 //printf( "readLen = %d\nfragLen = %d, fragStdev = %d\n", readLen, fragLen, fragStdev ) ;
545 delete[] lens ;
546 delete[] mateDiff ;
547 }
548 } ;
549 #endif