Mercurial > repos > lsong10 > psiclass
view PsiCLASS-1.0.2/AddXS.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line source
#include <stdio.h> #include <stdint.h> #include <string.h> #include <stdlib.h> #include <vector> #include <map> #include <fstream> char usage[] = "./addXS ref.fa [OPTIONS] < in.sam > out.sam\n" "From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ; char samLine[65537] ; char seq[65537] ; char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A' -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1 } ; char numToNuc[26] = {'A', 'C', 'G', 'T'} ; struct _pair { int a, b ; } ; class BitSequence { private: int len ; //std::vector<uint32_t> sequence ; uint32_t *sequence ; public: BitSequence() { len = 0 ; sequence = NULL ;} BitSequence( int l ) { len = 0 ; sequence = new uint32_t[ l / 16 + 1 ] ; } ~BitSequence() { //if ( sequence != NULL ) // delete[] sequence ; } void SetLength( int l ) { len = 0 ; if ( sequence != NULL ) delete[] sequence ; sequence = new uint32_t[ l / 16 + 1 ] ; } int GetLength() { return len ; } void Append( char c ) { if ( ( len & 15 ) == 0 ) { sequence[ len / 16 ] = 0 ; } ++len ; //printf( "%d %c\n", len, c ) ; Set( c, len - 1 ) ; } // pos is 0-based coordinate // notice that the order within one 32 bit butcket is reversed void Set( char c, int pos ) { if ( pos >= len ) return ; if ( c >= 'a' && c <= 'z' ) { c = c - 'a' + 'A' ; } if ( c == 'N' ) c = 'A' ; int ind = pos >> 4 ; int offset = pos & 15 ; int mask = ( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ; sequence[ind] = sequence[ind] | mask ; //if ( c != 'A' ) // printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ; //Print() ; } char Get( int pos ) { if ( pos >= len ) return 'N' ; int ind = pos >> 4 ; int offset = pos & 15 ; //printf( "%d: %d\n", pos, sequence[ind] ) ; return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ; } void Print() { int i ; for ( i = 0 ; i < len ; ++i ) printf( "%c", Get( i ) ) ; printf( "\n" ) ; } void Print( FILE *fp, int start, int end, bool rc ) { if ( !rc ) { for ( int i = start ; i <= end ; ++i ) fprintf( fp, "%c", Get( i ) ) ; } else { for ( int i = end ; i >= start ; --i ) { char c = Get( i ) ; if ( c == 'A' ) c = 'T' ; else if ( c == 'C' ) c = 'G' ; else if ( c == 'G' ) c = 'C' ; else //if ( c == 'T' ) c = 'A' ; fprintf( fp, "%c", c ) ; } } } } ; void ReverseComplement( char *s, int l ) { int u, v ; for ( u = 0, v = l - 1 ; u <= v ; ++u, --v ) { char tmp ; tmp = s[v] ; s[v] = s[u] ; s[u] = tmp ; s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ; s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ; } } int main( int argc, char *argv[] ) { int i, j, k ; std::vector<BitSequence> genome ; std::map<std::string, int> chrToChrId ; char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ; char buffer[1024] ; char pattern[1024] ; int start, mstart, flag, tlen ; // read start and mate read start int seqLen ; int chrCnt = 0 ; int chrId ; int width = 7 ; int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error if ( argc < 2 ) { printf( "%s", usage ) ; return 0 ; } std::ifstream fpRef ; fpRef.open( argv[1] ) ; std::string line ; int motifStrand[1024] ; /*motifStrand[ 0b10110010 ] = 1 ; // GT/AG motifStrand[ 0b01110001 ] = 0 ; // CT/AC motifStrand[ 0b10010010 ] = 1 ;// GC/AG motifStrand[ 0b01111001 ] = 0 ; // CT/GC motifStrand[ 0b00110001 ] = 1 ; // AT/AC motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/ memset( motifStrand, -1, sizeof( motifStrand )) ; motifStrand[ 0xb2 ] = 1 ; // GT/AG motifStrand[ 0x71 ] = 0 ; // CT/AC motifStrand[ 0x92 ] = 1 ;// GC/AG motifStrand[ 0x79 ] = 0 ; // CT/GC motifStrand[ 0x31 ] = 1 ; // AT/AC motifStrand[ 0xb3 ] = 0 ; // GT/AT k = 0 ; while ( getline( fpRef, line ) ) { //printf( "%s\n", line.c_str() ) ; int len = line.length() ; if ( line[0] == '>' ) { //char *s = strdup( line.c_str() + 1 ) ; if ( chrCnt > 0 ) { genome[ chrCnt - 1 ].SetLength( k ) ; } for ( i = 1 ; i < len ; ++i ) if ( line[i] == ' ' || line[i] == '\t' ) break ; chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ; ++chrCnt ; BitSequence bs ; genome.push_back( bs ) ; k = 0 ; } else { k += len ; } } genome[ chrCnt - 1 ].SetLength( k ) ; fpRef.close() ; fpRef.open( argv[1] ) ; while ( getline( fpRef, line ) ) { //printf( "%s\n", line.c_str() ) ; int len = line.length() ; if ( line[0] == '>' ) { //char *s = strdup( line.c_str() + 1 ) ; for ( i = 1 ; i < len ; ++i ) if ( line[i] == ' ' || line[i] == '\t' ) break ; chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ; } else { BitSequence &bs = genome[chrId] ; for ( i = 0 ; i < len ; ++i ) { if ( ( line[i] >= 'A' && line[i] <= 'Z' ) || ( line[i] >= 'a' && line[i] <= 'z' ) ) bs.Append( line[i] ) ; } } } fpRef.close() ; while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL ) { if ( samLine[0] == '@' ) { printf( "%s", samLine ) ; continue ; } sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ; struct _pair segments[1000] ; struct _pair seqSegments[1000] ; // the segments with respect to the sequence. int segCnt = 0 ; int seqSegCnt = 0 ; int num = 0 ; int len = 0 ; score = 0 ; for ( i = 0 ; cigar[i] ; ++i ) { if ( cigar[i] >= '0' && cigar[i] <= '9' ) num = num * 10 + cigar[i] - '0' ; else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H' || cigar[i] == 'P' ) { num = 0 ; } else if ( cigar[i] == 'N' ) { segments[segCnt].a = start ; segments[segCnt].b = start + len - 1 ; ++segCnt ; start = start + len + num ; len = 0 ; num = 0 ; } else { len += num ; num = 0 ; } } if ( len > 0 ) { segments[segCnt].a = start ; segments[segCnt].b = start + len - 1 ; ++segCnt ; } if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL ) { printf( "%s", samLine ) ; continue ; } std::string schr( chrom ) ; int chrId = chrToChrId[schr] ; int strand = -1 ; len = strlen( samLine ) ; samLine[len - 1] = '\0' ; int motif = 0 ; BitSequence &chrSeq = genome[ chrId ] ; for ( i = 0 ; i < segCnt - 1 ; ++i ) { char m[4] ; m[0] = chrSeq.Get( segments[i].b + 1 - 1 ) ; m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ; m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ; m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ; motif = 0 ; for ( j = 0 ; j < 4 ; ++j ) motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 ); strand = motifStrand[ motif ] ; if ( strand != -1 ) break ; } if ( strand == 1 ) { printf( "%s\tXS:A:+\n", samLine ) ; continue ; } else if ( strand == 0 ) { printf( "%s\tXS:A:-\n", samLine ) ; continue ; } char *p ; if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL ) { p += 5 ; num = 0 ; while ( *p >= '0' && *p <= '9' ) { num = num * 10 + *p - '0' ; ++p ; } if ( num > 1 ) score |= 1 ; } seqLen = strlen( seq ) ; for ( i = 0 ; i < seqLen ; ++i ) if ( seq[i] == 'N' ) { score |= 1 ; break ; } num = 0 ; len = 0 ; seqSegCnt = 0 ; start = 0 ; for ( i = 0 ; cigar[i] ; ++i ) { if ( cigar[i] >= '0' && cigar[i] <= '9' ) num = num * 10 + cigar[i] - '0' ; else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H' || cigar[i] == 'P' ) { num = 0 ; } else if ( cigar[i] == 'N' ) { seqSegments[seqSegCnt].a = start ; seqSegments[seqSegCnt].b = start + len - 1 ; ++seqSegCnt ; start = start + len ; len = 0 ; num = 0 ; } else { len += num ; num = 0 ; } } if ( len > 0 ) { seqSegments[seqSegCnt].a = start ; seqSegments[seqSegCnt].b = start + len - 1 ; ++seqSegCnt ; } // Check whether we can shift the intron to a canonical splice site for ( i = 0 ; i < segCnt - 1 ; ++i ) { for ( j = -width ; j < width ; ++j ) { if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b ) continue ; // Look for the splice signal. char m[4] ; m[0] = chrSeq.Get( segments[i].b + j + 1 - 1 ) ; m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ; m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ; m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ; motif = 0 ; for ( k = 0 ; k < 4 ; ++k ) motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 ); strand = motifStrand[ motif ] ; if ( strand == -1 ) continue ; //printf( "%d %d: %d\n", i, j, motif ) ; // We found a signal, then test whether we can shift the intron. // Extract the sequence from the reference genome. int l = 0 ; if ( j < 0 ) for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l ) buffer[l] = chrSeq.Get( k - 1 ) ; else for ( k = segments[i].b + 1, l= 0 ; k <= segments[i].b + j ; ++k, ++l ) buffer[l] = chrSeq.Get(k - 1) ; buffer[l] = '\0' ; //printf( "%s\n", buffer ) ; // Get the coordinate on the sequence. int tag ; int useBegin = 0 ; // is the portion from the beginning part of a seqSegment? int from, to ; if ( j < 0 ) { tag = i ; useBegin = 0 ; } else // j>0 { tag = i + 1 ; useBegin = 1 ; } //printf( "%d %d\n", tag, useBegin ) ; if ( ( flag & 0x10 ) != 0 ) { //TODO: it seems star already fliped the sequence. // is it true for other aligners? //tag = segCnt - 1 - tag ; //useBegin = 1 - useBegin ; //ReverseComplement( buffer, l ) ; } if ( useBegin ) { from = seqSegments[tag].a ; to = seqSegments[tag].a + l - 1 ; } else { from = seqSegments[tag].b - l + 1 ; to = seqSegments[tag].b ; } //printf( "%d %d %d\n", tag, from, to ) ; for ( k = 0 ; k < l ; ++k ) { if ( seq[from + k] != buffer[k] ) break ; } if ( k >= l ) { // We can shift the intron score |= 2 ; break ; } } if ( j < width ) break ; } // Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence. for ( i = 0 ; i < seqSegCnt ; ++i ) { int l ; for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k ) buffer[k] = chrSeq.Get( j ) ; buffer[k] = '0' ; for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l ) { pattern[l] = seq[j] ; } pattern[l] = '\0' ; // It seems the aligner already flipped the read in its output? //if ( flag & 0x10 != 0 ) // ReverseComplement( pattern, l ) ; char *p = buffer ; int cnt = 0 ; while ( ( p = strstr( p, pattern ) ) != NULL ) { ++cnt ; ++p ; } if ( cnt > 1 ) { score |= 4 ; break ; } int count[5] = { 0, 0, 0, 0, 0 } ; int max = 0 ; for ( j = 0 ; j < l ; ++j ) { ++count[ nucToNum[ pattern[j] - 'A' ] ] ; } for ( j = 0 ; j < 5 ; ++j ) { if ( count[j] > max ) max = count[j] ; } if ( max > 0.8 * l ) { score |= 4 ; break ; } } if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL ) { int nm = atoi( p + 5 ) ; if ( nm > 0 ) { score |= 8 ; } } else { // TODO: check the nm by ourself } // Check whether the alignment is concordant. if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 ) { start = segments[0].a ; if ( strcmp( mateChrom, "=" ) || ( flag & 0x10 ) == ( flag & 0x20 ) || ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) ) || ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) ) { score |= 16 ; } } printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ; } return 0 ; }