view PsiCLASS-1.0.2/alignments.hpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line source

// The class handles reading the bam file

#ifndef _LSONG_RSCAF_ALIGNMENT_HEADER
#define _LSONG_RSCAF_ALIGNMENT_HEADER

#include "samtools-0.1.19/sam.h"
#include <map>
#include <string>
#include <assert.h>
#include <iostream>
#include <stdlib.h>
#include <math.h>

#include "defs.h"

class Alignments
{
private:
	samfile_t *fpSam ;	
	bam1_t *b ;

	char fileName[1024] ;
	bool opened ;	
	std::map<std::string, int> chrNameToId ;
	bool allowSupplementary ;
	bool allowClip ;
	bool hasClipHead, hasClipTail ;
	int segmentsSum ; // the sum of segments.

	bool atBegin ;
	bool atEnd ;

	static int CompInt( const void *p1, const void *p2 )
	{
		return (*(int *)p1 ) - (*(int *)p2 ) ;
	}

	void Open()
	{
		fpSam = samopen( fileName, "rb", 0 ) ;
		if ( !fpSam->header )
		{
			fprintf( stderr, "Can not open %s.\n", fileName ) ;
			exit( 1 ) ;
		}

		// Collect the chromosome information
		for ( int i = 0 ; i < fpSam->header->n_targets ; ++i )		
		{
			std::string s( fpSam->header->target_name[i] ) ;
			chrNameToId[s] = i ;
		}
		opened = true ;
		atBegin = true ;
		atEnd = false ;
	}
public:
	struct _pair segments[MAX_SEG_COUNT] ;		
	int segCnt ;

	int totalReadCnt ;
	int fragLen, fragStdev ;
	int readLen ;
	bool matePaired ;
	
	Alignments() 
	{ 
		b = NULL ; 
		opened = false ; 
		atBegin = true ;
		atEnd = false ;
		allowSupplementary = false ;
		allowClip = true ;

		totalReadCnt = 0 ;
		fragLen = 0 ;
		fragStdev = 0 ;
		readLen = 0 ;
		matePaired = false ;
	}
	~Alignments() 
	{
		if ( b )
			bam_destroy1( b ) ;
	}

	void Open( char *file )
	{
		strcpy( fileName, file ) ;
		Open() ;
	}

	void Rewind()
	{
		Close() ;
		Open() ;

		atBegin = true ;
		atEnd = false ;
	}

	void Close()
	{
		samclose( fpSam ) ;
		fpSam = NULL ;
	}

	bool IsOpened()
	{
		return opened ;
	}

	bool IsAtBegin()
	{
		return atBegin ;
	}

	bool IsAtEnd()
	{
		return atEnd ;
	}

	bool HasClip()
	{
		return hasClipHead || hasClipTail ;
	}

	bool HasClipHead()
	{
		return hasClipHead ;
	}
	bool HasClipTail()
	{
		return hasClipTail ;
	}

	int Next()
	{
		int i ;
		int start = 0, len = 0 ;
		uint32_t *rawCigar ;

		if ( atBegin == true )
			totalReadCnt = 0 ;

		atBegin = false ;
		while ( 1 )
		{
			while ( 1 )
			{
				if ( b )
					bam_destroy1( b ) ;
				b = bam_init1() ;

				if ( samread( fpSam, b ) <= 0 )
				{
					atEnd = true ;
					return 0 ;
				}

				if ( ( b->core.flag & 0x900 ) == 0 )
					++totalReadCnt ;
				
				if ( b->core.flag & 0xC )
					continue ;

				//if ( ( b->core.flag & 0x900 ) == 0 )
					break ;
			}
			
			// to many repeat.
			if ( bam_aux_get( b, "NH" ) )
			{
				if ( bam_aux2i( bam_aux_get( b, "NH" ) ) >= 5 )
					continue ;
			}

			// Compute the exons segments from the reads
			segCnt = 0 ;
			start = b->core.pos ; //+ 1 ;
			rawCigar = bam1_cigar( b ) ; 
			// Check whether the query length is compatible with the read
			if ( bam_cigar2qlen( &b->core, rawCigar ) != b->core.l_qseq ) 
				continue ;

			bool clipMiddle = false ;
			int clipSum = 0 ;
			hasClipHead = hasClipTail = false ;
			len = 0 ;
			segmentsSum = 0 ;
			for ( i = 0 ; i < b->core.n_cigar ; ++i )
			{
				int op = rawCigar[i] & BAM_CIGAR_MASK ;
				int num = rawCigar[i] >> BAM_CIGAR_SHIFT ;

				switch ( op )
				{
					case BAM_CMATCH:
					case BAM_CDEL:
						len += num ; break ;
					case BAM_CSOFT_CLIP:
					case BAM_CHARD_CLIP:
					case BAM_CPAD:
					{
						if ( i == 0 )
							hasClipHead = true ;
						else if ( i == b->core.n_cigar - 1 )
							hasClipTail = true ;
						else 
							clipMiddle = true ;
					
						clipSum += num ;
					}
					case BAM_CINS:
						num = 0 ; break ;
					case BAM_CREF_SKIP:
						{
							segments[ segCnt ].a = start ;
							segments[ segCnt ].b = start + len - 1 ;
							++segCnt ;
							segmentsSum += len ;
							start = start + len + num ;
							len = 0 ;
						} break ;
					default:
						len += num ; break ;
				}
			}
			if ( clipMiddle ) // should never happend
				continue ;

			if ( clipSum >= 2 && !allowClip )
				continue ;

			if ( len > 0 )
			{
				segments[ segCnt ].a = start ;
				segments[ segCnt ].b = start + len - 1 ;
				++segCnt ;
				segmentsSum += len ;
			}
			/*if ( !strcmp( bam1_qname( b ), "chr1:109656301-109749401W:ENST00000490758.2:381:1480:1090:1290:X" ) )
			{
				for ( i = 0 ; i < segCnt ; ++i )
					printf( "(%d %d) ", segments[i].a, segments[i].b ) ;
				printf( "\n" ) ;
			}*/
			
			// Check whether the mates are compatible
			//int mChrId = b->core.mtid ;
			int64_t mPos = b->core.mpos ;

			if ( b->core.mtid == b->core.tid )
			{
				for ( i = 0 ; i < segCnt - 1 ; ++i )
				{
					if ( mPos >= segments[i].b && mPos <= segments[i + 1].a )
						break ;
				}
				if ( i < segCnt - 1 )
					continue ;
			}
			
			break ;
		}

		return 1 ;
	}


	int GetChromId()
	{
		return b->core.tid ; 
	}

	char* GetChromName( int tid )
	{
		return fpSam->header->target_name[ tid ] ; 
	}

	int GetChromIdFromName( const char *s )
	{
		std::string ss( s ) ;
		if ( chrNameToId.find( ss ) == chrNameToId.end() )
		{
			printf( "Unknown genome name: %s\n", s ) ;
			exit( 1 ) ;
		}
		return chrNameToId[ss] ;
	}

	int GetChromLength( int tid )
	{
		return fpSam->header->target_len[ tid ] ;
	}

	int GetChromCount()
	{
		return fpSam->header->n_targets ;
	}

	void GetMatePosition( int &chrId, int64_t &pos )
	{
		if ( b->core.flag & 0x8 )
		{
			chrId = -1 ;
			pos = -1 ;
		}
		else
		{
			chrId = b->core.mtid ;
			pos = b->core.mpos ; //+ 1 ;
		}
	}

	int GetRepeatPosition( int &chrId, int64_t &pos )
	{
		// Look at the CC field.
		if ( !bam_aux_get( b, "CC" ) || !bam_aux_get( b, "CP" ) )
		{
			chrId = -1 ;
			pos = -1 ;
			return 0 ;
		}
		
		std::string s( bam_aux2Z( bam_aux_get(b, "CC" ) ) ) ;
		chrId = chrNameToId[ s ] ;
		pos = bam_aux2i( bam_aux_get( b, "CP" ) ) ;// Possible error for 64bit	
		return 1 ;
	}

	void GetFileName( char *buffer )
	{
		strcpy( buffer, fileName ) ;
	}

	int GetReadLength()
	{
		return b->core.l_qseq ;
	}

	int GetRefCoverLength()
	{
		return segmentsSum ;
	}

	bool IsFirstMate()
	{
		if ( b->core.flag & 0x40 )
			return true ;
		return false ;
	}

	bool IsReverse()
	{
		if ( b->core.flag & 0x10 )	
			return true ;
		return false ;
	}

	bool IsMateReverse()
	{
		if ( b->core.flag & 0x20 )
			return true ;
		return false ;
	}

	char *GetReadId()
	{
		return bam1_qname( b ) ;
	}

	bool IsUnique()
	{
		if ( bam_aux_get( b, "NH" ) )
		{
			if ( bam_aux2i( bam_aux_get( b, "NH" ) ) > 1 )
				return false ;
		}	
		if ( IsSupplementary() && GetFieldZ( (char *)"XZ" ) != NULL )
			return false ;
		return true ;
	}

	bool IsPrimary()
	{
		if ( ( b->core.flag & 0x900 ) == 0 )
			return true ;
		else
			return false ;
	}
	
	// -1:minus, 0: unknown, 1:plus
	int GetStrand()
	{
		if ( segCnt == 1 )
			return 0 ;
		if ( bam_aux_get( b, "XS" ) )
		{
			if ( bam_aux2A( bam_aux_get( b, "XS" ) ) == '-' )
				return -1 ;	
			else
				return 1 ;
		}
		else
			return 0 ;
	}

	int GetFieldI( char *f )
	{
		if ( bam_aux_get( b, f ) )
		{
			return bam_aux2i( bam_aux_get( b, f ) ) ;
		}
		return -1 ;
	}

	char *GetFieldZ( char *f )
	{
		if ( bam_aux_get( b, f ) )
		{
			return bam_aux2Z( bam_aux_get( b, f ) ) ;
		}
		return NULL ;
	}
	
	int GetNumberOfHits()
	{
		if ( bam_aux_get( b, "NH" ) )
		{
			return bam_aux2i( bam_aux_get( b, "NH" ) ) ;
		}	
		return 1 ;
	}
	
	bool IsSupplementary()
	{
		if ( ( b->core.flag & 0x800 ) == 0 )
			return false ;
		else
			return true ;
	}

	void SetAllowSupplementary( bool in )
	{ 
		allowSupplementary = in ;
	}

	void SetAllowClip( bool in )
	{
		allowClip = in ;
	}

	bool IsGCRich( bool threshold = 0.9 )
	{
		int i = 0 ;
		int gcCnt = 0 ;
		for ( i = 0 ; i < b->core.l_qseq ; ++i )
		{
			int bit = bam1_seqi( bam1_seq( b ), i ) ;
			if ( bit == 2 || bit == 4 )
				++gcCnt ;		
		}
		if ( gcCnt >= threshold * b->core.l_qseq )
			return true ;
	}

	void GetGeneralInfo( bool stopEarly = false )
	{
		int i, k ;

		const int sampleMax = 1000000 ;
		int *lens = new int[sampleMax] ;
		int *mateDiff = new int[sampleMax] ;
		int lensCnt = 0 ;
		int mateDiffCnt = 0 ;
		bool end = false ;
		
		while ( 1 )
		{
			while ( 1 )
			{
				if ( b )
					bam_destroy1( b ) ;
				b = bam_init1() ;

				if ( samread( fpSam, b ) <= 0 )
				{
					end = true ;
					break ;
				}
				if ( b->core.flag & 0xC )
					continue ;

				if ( ( b->core.flag & 0x900 ) == 0 )
					break ;
			}
			if ( end )
				break ;
			
			if ( lensCnt < sampleMax )
			{
				lens[ lensCnt ] = b->core.l_qseq ; 
				++lensCnt ;
			}

			if ( mateDiffCnt < sampleMax && b->core.tid == b->core.mtid 
				&& b->core.pos < b->core.mpos )
			{
				mateDiff[ mateDiffCnt ] = b->core.mpos - b->core.pos ;
				++mateDiffCnt ;
			}
			++totalReadCnt ; 
			if ( totalReadCnt >= sampleMax && stopEarly )
				break ;
		}

		// Get the read length info and fragment length info
		qsort( lens, lensCnt, sizeof( int ), CompInt ) ;
		readLen = lens[ lensCnt - 1 ] ;
		
		if ( mateDiffCnt > 0 )
		{
			matePaired = true ;

			qsort( mateDiff, mateDiffCnt, sizeof( int ), CompInt ) ;
			long long int sum = 0 ;
			long long int sumsq = 0 ;
			
			for ( i = 0 ; i < mateDiffCnt * 0.7 ; ++i )
			{
				sum += ( mateDiff[i] + readLen );
				sumsq += ( mateDiff[i] + readLen ) * ( mateDiff[i] + readLen ) ;
			}
			k = i ;
			fragLen = (int)( sum / k ) ;
			fragStdev = (int)sqrt( sumsq / k - fragLen * fragLen ) ;
		}
		else
		{
			fragLen = readLen ;
			fragStdev = 0 ;
		}
		//printf( "readLen = %d\nfragLen = %d, fragStdev = %d\n", readLen, fragLen, fragStdev ) ;		
		delete[] lens ;
		delete[] mateDiff ;
	}
} ;
#endif