Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/alignments.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/alignments.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,549 @@ +// The class handles reading the bam file + +#ifndef _LSONG_RSCAF_ALIGNMENT_HEADER +#define _LSONG_RSCAF_ALIGNMENT_HEADER + +#include "samtools-0.1.19/sam.h" +#include <map> +#include <string> +#include <assert.h> +#include <iostream> +#include <stdlib.h> +#include <math.h> + +#include "defs.h" + +class Alignments +{ +private: + samfile_t *fpSam ; + bam1_t *b ; + + char fileName[1024] ; + bool opened ; + std::map<std::string, int> chrNameToId ; + bool allowSupplementary ; + bool allowClip ; + bool hasClipHead, hasClipTail ; + int segmentsSum ; // the sum of segments. + + bool atBegin ; + bool atEnd ; + + static int CompInt( const void *p1, const void *p2 ) + { + return (*(int *)p1 ) - (*(int *)p2 ) ; + } + + void Open() + { + fpSam = samopen( fileName, "rb", 0 ) ; + if ( !fpSam->header ) + { + fprintf( stderr, "Can not open %s.\n", fileName ) ; + exit( 1 ) ; + } + + // Collect the chromosome information + for ( int i = 0 ; i < fpSam->header->n_targets ; ++i ) + { + std::string s( fpSam->header->target_name[i] ) ; + chrNameToId[s] = i ; + } + opened = true ; + atBegin = true ; + atEnd = false ; + } +public: + struct _pair segments[MAX_SEG_COUNT] ; + int segCnt ; + + int totalReadCnt ; + int fragLen, fragStdev ; + int readLen ; + bool matePaired ; + + Alignments() + { + b = NULL ; + opened = false ; + atBegin = true ; + atEnd = false ; + allowSupplementary = false ; + allowClip = true ; + + totalReadCnt = 0 ; + fragLen = 0 ; + fragStdev = 0 ; + readLen = 0 ; + matePaired = false ; + } + ~Alignments() + { + if ( b ) + bam_destroy1( b ) ; + } + + void Open( char *file ) + { + strcpy( fileName, file ) ; + Open() ; + } + + void Rewind() + { + Close() ; + Open() ; + + atBegin = true ; + atEnd = false ; + } + + void Close() + { + samclose( fpSam ) ; + fpSam = NULL ; + } + + bool IsOpened() + { + return opened ; + } + + bool IsAtBegin() + { + return atBegin ; + } + + bool IsAtEnd() + { + return atEnd ; + } + + bool HasClip() + { + return hasClipHead || hasClipTail ; + } + + bool HasClipHead() + { + return hasClipHead ; + } + bool HasClipTail() + { + return hasClipTail ; + } + + int Next() + { + int i ; + int start = 0, len = 0 ; + uint32_t *rawCigar ; + + if ( atBegin == true ) + totalReadCnt = 0 ; + + atBegin = false ; + while ( 1 ) + { + while ( 1 ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + + if ( samread( fpSam, b ) <= 0 ) + { + atEnd = true ; + return 0 ; + } + + if ( ( b->core.flag & 0x900 ) == 0 ) + ++totalReadCnt ; + + if ( b->core.flag & 0xC ) + continue ; + + //if ( ( b->core.flag & 0x900 ) == 0 ) + break ; + } + + // to many repeat. + if ( bam_aux_get( b, "NH" ) ) + { + if ( bam_aux2i( bam_aux_get( b, "NH" ) ) >= 5 ) + continue ; + } + + // Compute the exons segments from the reads + segCnt = 0 ; + start = b->core.pos ; //+ 1 ; + rawCigar = bam1_cigar( b ) ; + // Check whether the query length is compatible with the read + if ( bam_cigar2qlen( &b->core, rawCigar ) != b->core.l_qseq ) + continue ; + + bool clipMiddle = false ; + int clipSum = 0 ; + hasClipHead = hasClipTail = false ; + len = 0 ; + segmentsSum = 0 ; + for ( i = 0 ; i < b->core.n_cigar ; ++i ) + { + int op = rawCigar[i] & BAM_CIGAR_MASK ; + int num = rawCigar[i] >> BAM_CIGAR_SHIFT ; + + switch ( op ) + { + case BAM_CMATCH: + case BAM_CDEL: + len += num ; break ; + case BAM_CSOFT_CLIP: + case BAM_CHARD_CLIP: + case BAM_CPAD: + { + if ( i == 0 ) + hasClipHead = true ; + else if ( i == b->core.n_cigar - 1 ) + hasClipTail = true ; + else + clipMiddle = true ; + + clipSum += num ; + } + case BAM_CINS: + num = 0 ; break ; + case BAM_CREF_SKIP: + { + segments[ segCnt ].a = start ; + segments[ segCnt ].b = start + len - 1 ; + ++segCnt ; + segmentsSum += len ; + start = start + len + num ; + len = 0 ; + } break ; + default: + len += num ; break ; + } + } + if ( clipMiddle ) // should never happend + continue ; + + if ( clipSum >= 2 && !allowClip ) + continue ; + + if ( len > 0 ) + { + segments[ segCnt ].a = start ; + segments[ segCnt ].b = start + len - 1 ; + ++segCnt ; + segmentsSum += len ; + } + /*if ( !strcmp( bam1_qname( b ), "chr1:109656301-109749401W:ENST00000490758.2:381:1480:1090:1290:X" ) ) + { + for ( i = 0 ; i < segCnt ; ++i ) + printf( "(%d %d) ", segments[i].a, segments[i].b ) ; + printf( "\n" ) ; + }*/ + + // Check whether the mates are compatible + //int mChrId = b->core.mtid ; + int64_t mPos = b->core.mpos ; + + if ( b->core.mtid == b->core.tid ) + { + for ( i = 0 ; i < segCnt - 1 ; ++i ) + { + if ( mPos >= segments[i].b && mPos <= segments[i + 1].a ) + break ; + } + if ( i < segCnt - 1 ) + continue ; + } + + break ; + } + + return 1 ; + } + + + int GetChromId() + { + return b->core.tid ; + } + + char* GetChromName( int tid ) + { + return fpSam->header->target_name[ tid ] ; + } + + int GetChromIdFromName( const char *s ) + { + std::string ss( s ) ; + if ( chrNameToId.find( ss ) == chrNameToId.end() ) + { + printf( "Unknown genome name: %s\n", s ) ; + exit( 1 ) ; + } + return chrNameToId[ss] ; + } + + int GetChromLength( int tid ) + { + return fpSam->header->target_len[ tid ] ; + } + + int GetChromCount() + { + return fpSam->header->n_targets ; + } + + void GetMatePosition( int &chrId, int64_t &pos ) + { + if ( b->core.flag & 0x8 ) + { + chrId = -1 ; + pos = -1 ; + } + else + { + chrId = b->core.mtid ; + pos = b->core.mpos ; //+ 1 ; + } + } + + int GetRepeatPosition( int &chrId, int64_t &pos ) + { + // Look at the CC field. + if ( !bam_aux_get( b, "CC" ) || !bam_aux_get( b, "CP" ) ) + { + chrId = -1 ; + pos = -1 ; + return 0 ; + } + + std::string s( bam_aux2Z( bam_aux_get(b, "CC" ) ) ) ; + chrId = chrNameToId[ s ] ; + pos = bam_aux2i( bam_aux_get( b, "CP" ) ) ;// Possible error for 64bit + return 1 ; + } + + void GetFileName( char *buffer ) + { + strcpy( buffer, fileName ) ; + } + + int GetReadLength() + { + return b->core.l_qseq ; + } + + int GetRefCoverLength() + { + return segmentsSum ; + } + + bool IsFirstMate() + { + if ( b->core.flag & 0x40 ) + return true ; + return false ; + } + + bool IsReverse() + { + if ( b->core.flag & 0x10 ) + return true ; + return false ; + } + + bool IsMateReverse() + { + if ( b->core.flag & 0x20 ) + return true ; + return false ; + } + + char *GetReadId() + { + return bam1_qname( b ) ; + } + + bool IsUnique() + { + if ( bam_aux_get( b, "NH" ) ) + { + if ( bam_aux2i( bam_aux_get( b, "NH" ) ) > 1 ) + return false ; + } + if ( IsSupplementary() && GetFieldZ( (char *)"XZ" ) != NULL ) + return false ; + return true ; + } + + bool IsPrimary() + { + if ( ( b->core.flag & 0x900 ) == 0 ) + return true ; + else + return false ; + } + + // -1:minus, 0: unknown, 1:plus + int GetStrand() + { + if ( segCnt == 1 ) + return 0 ; + if ( bam_aux_get( b, "XS" ) ) + { + if ( bam_aux2A( bam_aux_get( b, "XS" ) ) == '-' ) + return -1 ; + else + return 1 ; + } + else + return 0 ; + } + + int GetFieldI( char *f ) + { + if ( bam_aux_get( b, f ) ) + { + return bam_aux2i( bam_aux_get( b, f ) ) ; + } + return -1 ; + } + + char *GetFieldZ( char *f ) + { + if ( bam_aux_get( b, f ) ) + { + return bam_aux2Z( bam_aux_get( b, f ) ) ; + } + return NULL ; + } + + int GetNumberOfHits() + { + if ( bam_aux_get( b, "NH" ) ) + { + return bam_aux2i( bam_aux_get( b, "NH" ) ) ; + } + return 1 ; + } + + bool IsSupplementary() + { + if ( ( b->core.flag & 0x800 ) == 0 ) + return false ; + else + return true ; + } + + void SetAllowSupplementary( bool in ) + { + allowSupplementary = in ; + } + + void SetAllowClip( bool in ) + { + allowClip = in ; + } + + bool IsGCRich( bool threshold = 0.9 ) + { + int i = 0 ; + int gcCnt = 0 ; + for ( i = 0 ; i < b->core.l_qseq ; ++i ) + { + int bit = bam1_seqi( bam1_seq( b ), i ) ; + if ( bit == 2 || bit == 4 ) + ++gcCnt ; + } + if ( gcCnt >= threshold * b->core.l_qseq ) + return true ; + } + + void GetGeneralInfo( bool stopEarly = false ) + { + int i, k ; + + const int sampleMax = 1000000 ; + int *lens = new int[sampleMax] ; + int *mateDiff = new int[sampleMax] ; + int lensCnt = 0 ; + int mateDiffCnt = 0 ; + bool end = false ; + + while ( 1 ) + { + while ( 1 ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + + if ( samread( fpSam, b ) <= 0 ) + { + end = true ; + break ; + } + if ( b->core.flag & 0xC ) + continue ; + + if ( ( b->core.flag & 0x900 ) == 0 ) + break ; + } + if ( end ) + break ; + + if ( lensCnt < sampleMax ) + { + lens[ lensCnt ] = b->core.l_qseq ; + ++lensCnt ; + } + + if ( mateDiffCnt < sampleMax && b->core.tid == b->core.mtid + && b->core.pos < b->core.mpos ) + { + mateDiff[ mateDiffCnt ] = b->core.mpos - b->core.pos ; + ++mateDiffCnt ; + } + ++totalReadCnt ; + if ( totalReadCnt >= sampleMax && stopEarly ) + break ; + } + + // Get the read length info and fragment length info + qsort( lens, lensCnt, sizeof( int ), CompInt ) ; + readLen = lens[ lensCnt - 1 ] ; + + if ( mateDiffCnt > 0 ) + { + matePaired = true ; + + qsort( mateDiff, mateDiffCnt, sizeof( int ), CompInt ) ; + long long int sum = 0 ; + long long int sumsq = 0 ; + + for ( i = 0 ; i < mateDiffCnt * 0.7 ; ++i ) + { + sum += ( mateDiff[i] + readLen ); + sumsq += ( mateDiff[i] + readLen ) * ( mateDiff[i] + readLen ) ; + } + k = i ; + fragLen = (int)( sum / k ) ; + fragStdev = (int)sqrt( sumsq / k - fragLen * fragLen ) ; + } + else + { + fragLen = readLen ; + fragStdev = 0 ; + } + //printf( "readLen = %d\nfragLen = %d, fragStdev = %d\n", readLen, fragLen, fragStdev ) ; + delete[] lens ; + delete[] mateDiff ; + } +} ; +#endif