Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/AddXS.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/AddXS.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,583 @@ +#include <stdio.h> +#include <stdint.h> + +#include <string.h> +#include <stdlib.h> +#include <vector> +#include <map> +#include <fstream> + +char usage[] = + "./addXS ref.fa [OPTIONS] < in.sam > out.sam\n" + "From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ; +char samLine[65537] ; +char seq[65537] ; + +char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2, + -1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A' + -1, -1, -1, -1, -1, 3, + -1, -1, -1, -1, -1, -1 } ; +char numToNuc[26] = {'A', 'C', 'G', 'T'} ; + +struct _pair +{ + int a, b ; +} ; + +class BitSequence +{ +private: + int len ; + //std::vector<uint32_t> sequence ; + uint32_t *sequence ; + +public: + BitSequence() { len = 0 ; sequence = NULL ;} + BitSequence( int l ) + { + len = 0 ; + sequence = new uint32_t[ l / 16 + 1 ] ; + } + + ~BitSequence() + { + //if ( sequence != NULL ) + // delete[] sequence ; + } + + void SetLength( int l ) + { + len = 0 ; + if ( sequence != NULL ) + delete[] sequence ; + sequence = new uint32_t[ l / 16 + 1 ] ; + } + + int GetLength() + { + return len ; + } + + void Append( char c ) + { + if ( ( len & 15 ) == 0 ) + { + sequence[ len / 16 ] = 0 ; + } + ++len ; + //printf( "%d %c\n", len, c ) ; + Set( c, len - 1 ) ; + } + + // pos is 0-based coordinate + // notice that the order within one 32 bit butcket is reversed + void Set( char c, int pos ) + { + if ( pos >= len ) + return ; + + if ( c >= 'a' && c <= 'z' ) + { + c = c - 'a' + 'A' ; + } + if ( c == 'N' ) + c = 'A' ; + + int ind = pos >> 4 ; + int offset = pos & 15 ; + int mask = ( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ; + sequence[ind] = sequence[ind] | mask ; + //if ( c != 'A' ) + // printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ; + //Print() ; + } + + char Get( int pos ) + { + if ( pos >= len ) + return 'N' ; + + int ind = pos >> 4 ; + int offset = pos & 15 ; + //printf( "%d: %d\n", pos, sequence[ind] ) ; + return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ; + } + + void Print() + { + int i ; + for ( i = 0 ; i < len ; ++i ) + printf( "%c", Get( i ) ) ; + printf( "\n" ) ; + } + + void Print( FILE *fp, int start, int end, bool rc ) + { + if ( !rc ) + { + for ( int i = start ; i <= end ; ++i ) + fprintf( fp, "%c", Get( i ) ) ; + } + else + { + for ( int i = end ; i >= start ; --i ) + { + char c = Get( i ) ; + if ( c == 'A' ) + c = 'T' ; + else if ( c == 'C' ) + c = 'G' ; + else if ( c == 'G' ) + c = 'C' ; + else //if ( c == 'T' ) + c = 'A' ; + fprintf( fp, "%c", c ) ; + } + } + } +} ; + +void ReverseComplement( char *s, int l ) +{ + int u, v ; + for ( u = 0, v = l - 1 ; u <= v ; ++u, --v ) + { + char tmp ; + tmp = s[v] ; + s[v] = s[u] ; + s[u] = tmp ; + + s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ; + s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ; + } +} + + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + + std::vector<BitSequence> genome ; + std::map<std::string, int> chrToChrId ; + + char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ; + char buffer[1024] ; + char pattern[1024] ; + int start, mstart, flag, tlen ; // read start and mate read start + int seqLen ; + int chrCnt = 0 ; + int chrId ; + int width = 7 ; + int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error + + if ( argc < 2 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + + std::ifstream fpRef ; + fpRef.open( argv[1] ) ; + std::string line ; + + int motifStrand[1024] ; + + /*motifStrand[ 0b10110010 ] = 1 ; // GT/AG + motifStrand[ 0b01110001 ] = 0 ; // CT/AC + motifStrand[ 0b10010010 ] = 1 ;// GC/AG + motifStrand[ 0b01111001 ] = 0 ; // CT/GC + motifStrand[ 0b00110001 ] = 1 ; // AT/AC + motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/ + memset( motifStrand, -1, sizeof( motifStrand )) ; + motifStrand[ 0xb2 ] = 1 ; // GT/AG + motifStrand[ 0x71 ] = 0 ; // CT/AC + motifStrand[ 0x92 ] = 1 ;// GC/AG + motifStrand[ 0x79 ] = 0 ; // CT/GC + motifStrand[ 0x31 ] = 1 ; // AT/AC + motifStrand[ 0xb3 ] = 0 ; // GT/AT + + k = 0 ; + while ( getline( fpRef, line ) ) + { + //printf( "%s\n", line.c_str() ) ; + int len = line.length() ; + if ( line[0] == '>' ) + { + //char *s = strdup( line.c_str() + 1 ) ; + if ( chrCnt > 0 ) + { + genome[ chrCnt - 1 ].SetLength( k ) ; + } + for ( i = 1 ; i < len ; ++i ) + if ( line[i] == ' ' || line[i] == '\t' ) + break ; + chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ; + ++chrCnt ; + + BitSequence bs ; + genome.push_back( bs ) ; + + k = 0 ; + } + else + { + k += len ; + } + } + genome[ chrCnt - 1 ].SetLength( k ) ; + fpRef.close() ; + + fpRef.open( argv[1] ) ; + while ( getline( fpRef, line ) ) + { + //printf( "%s\n", line.c_str() ) ; + int len = line.length() ; + if ( line[0] == '>' ) + { + //char *s = strdup( line.c_str() + 1 ) ; + for ( i = 1 ; i < len ; ++i ) + if ( line[i] == ' ' || line[i] == '\t' ) + break ; + chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ; + } + else + { + BitSequence &bs = genome[chrId] ; + for ( i = 0 ; i < len ; ++i ) + { + if ( ( line[i] >= 'A' && line[i] <= 'Z' ) || + ( line[i] >= 'a' && line[i] <= 'z' ) ) + bs.Append( line[i] ) ; + } + } + } + fpRef.close() ; + + + + while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL ) + { + if ( samLine[0] == '@' ) + { + printf( "%s", samLine ) ; + continue ; + } + + sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ; + struct _pair segments[1000] ; + struct _pair seqSegments[1000] ; // the segments with respect to the sequence. + int segCnt = 0 ; + int seqSegCnt = 0 ; + int num = 0 ; + int len = 0 ; + + score = 0 ; + + for ( i = 0 ; cigar[i] ; ++i ) + { + if ( cigar[i] >= '0' && cigar[i] <= '9' ) + num = num * 10 + cigar[i] - '0' ; + else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H' + || cigar[i] == 'P' ) + { + num = 0 ; + } + else if ( cigar[i] == 'N' ) + { + segments[segCnt].a = start ; + segments[segCnt].b = start + len - 1 ; + ++segCnt ; + + start = start + len + num ; + len = 0 ; + num = 0 ; + } + else + { + len += num ; + num = 0 ; + } + } + + + if ( len > 0 ) + { + segments[segCnt].a = start ; + segments[segCnt].b = start + len - 1 ; + ++segCnt ; + } + + if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL ) + { + printf( "%s", samLine ) ; + continue ; + } + + std::string schr( chrom ) ; + int chrId = chrToChrId[schr] ; + int strand = -1 ; + + len = strlen( samLine ) ; + samLine[len - 1] = '\0' ; + int motif = 0 ; + + BitSequence &chrSeq = genome[ chrId ] ; + + for ( i = 0 ; i < segCnt - 1 ; ++i ) + { + char m[4] ; + m[0] = chrSeq.Get( segments[i].b + 1 - 1 ) ; + m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ; + m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ; + m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ; + motif = 0 ; + for ( j = 0 ; j < 4 ; ++j ) + motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 ); + strand = motifStrand[ motif ] ; + if ( strand != -1 ) + break ; + } + if ( strand == 1 ) + { + printf( "%s\tXS:A:+\n", samLine ) ; + continue ; + } + else if ( strand == 0 ) + { + printf( "%s\tXS:A:-\n", samLine ) ; + continue ; + } + + char *p ; + if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL ) + { + p += 5 ; + num = 0 ; + while ( *p >= '0' && *p <= '9' ) + { + num = num * 10 + *p - '0' ; + ++p ; + } + if ( num > 1 ) + score |= 1 ; + } + + seqLen = strlen( seq ) ; + for ( i = 0 ; i < seqLen ; ++i ) + if ( seq[i] == 'N' ) + { + score |= 1 ; + break ; + } + + num = 0 ; + len = 0 ; + seqSegCnt = 0 ; + start = 0 ; + for ( i = 0 ; cigar[i] ; ++i ) + { + if ( cigar[i] >= '0' && cigar[i] <= '9' ) + num = num * 10 + cigar[i] - '0' ; + else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H' + || cigar[i] == 'P' ) + { + num = 0 ; + } + else if ( cigar[i] == 'N' ) + { + seqSegments[seqSegCnt].a = start ; + seqSegments[seqSegCnt].b = start + len - 1 ; + ++seqSegCnt ; + + start = start + len ; + len = 0 ; + num = 0 ; + } + else + { + len += num ; + num = 0 ; + } + } + if ( len > 0 ) + { + seqSegments[seqSegCnt].a = start ; + seqSegments[seqSegCnt].b = start + len - 1 ; + ++seqSegCnt ; + } + + // Check whether we can shift the intron to a canonical splice site + for ( i = 0 ; i < segCnt - 1 ; ++i ) + { + for ( j = -width ; j < width ; ++j ) + { + if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b ) + continue ; + // Look for the splice signal. + char m[4] ; + m[0] = chrSeq.Get( segments[i].b + j + 1 - 1 ) ; + m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ; + m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ; + m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ; + motif = 0 ; + for ( k = 0 ; k < 4 ; ++k ) + motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 ); + strand = motifStrand[ motif ] ; + if ( strand == -1 ) + continue ; + //printf( "%d %d: %d\n", i, j, motif ) ; + + // We found a signal, then test whether we can shift the intron. + // Extract the sequence from the reference genome. + int l = 0 ; + if ( j < 0 ) + for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l ) + buffer[l] = chrSeq.Get( k - 1 ) ; + else + for ( k = segments[i].b + 1, l= 0 ; k <= segments[i].b + j ; ++k, ++l ) + buffer[l] = chrSeq.Get(k - 1) ; + buffer[l] = '\0' ; + + //printf( "%s\n", buffer ) ; + + // Get the coordinate on the sequence. + int tag ; + int useBegin = 0 ; // is the portion from the beginning part of a seqSegment? + int from, to ; + if ( j < 0 ) + { + tag = i ; + useBegin = 0 ; + } + else // j>0 + { + tag = i + 1 ; + useBegin = 1 ; + } + //printf( "%d %d\n", tag, useBegin ) ; + + if ( ( flag & 0x10 ) != 0 ) + { + //TODO: it seems star already fliped the sequence. + // is it true for other aligners? + + //tag = segCnt - 1 - tag ; + //useBegin = 1 - useBegin ; + + + //ReverseComplement( buffer, l ) ; + } + + if ( useBegin ) + { + from = seqSegments[tag].a ; + to = seqSegments[tag].a + l - 1 ; + } + else + { + from = seqSegments[tag].b - l + 1 ; + to = seqSegments[tag].b ; + } + //printf( "%d %d %d\n", tag, from, to ) ; + + for ( k = 0 ; k < l ; ++k ) + { + if ( seq[from + k] != buffer[k] ) + break ; + } + + + if ( k >= l ) + { + // We can shift the intron + score |= 2 ; + break ; + } + } + if ( j < width ) + break ; + } + + // Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence. + for ( i = 0 ; i < seqSegCnt ; ++i ) + { + int l ; + for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k ) + buffer[k] = chrSeq.Get( j ) ; + buffer[k] = '0' ; + + for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l ) + { + pattern[l] = seq[j] ; + } + pattern[l] = '\0' ; + + // It seems the aligner already flipped the read in its output? + //if ( flag & 0x10 != 0 ) + // ReverseComplement( pattern, l ) ; + + + char *p = buffer ; + int cnt = 0 ; + while ( ( p = strstr( p, pattern ) ) != NULL ) + { + ++cnt ; + ++p ; + } + if ( cnt > 1 ) + { + score |= 4 ; + break ; + } + + int count[5] = { 0, 0, 0, 0, 0 } ; + int max = 0 ; + for ( j = 0 ; j < l ; ++j ) + { + ++count[ nucToNum[ pattern[j] - 'A' ] ] ; + } + + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + } + if ( max > 0.8 * l ) + { + score |= 4 ; + break ; + } + } + + if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL ) + { + int nm = atoi( p + 5 ) ; + if ( nm > 0 ) + { + score |= 8 ; + } + } + else + { + // TODO: check the nm by ourself + } + + // Check whether the alignment is concordant. + if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 ) + { + start = segments[0].a ; + if ( strcmp( mateChrom, "=" ) + || ( flag & 0x10 ) == ( flag & 0x20 ) + || ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) ) + || ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) ) + { + score |= 16 ; + } + } + + printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ; + } + + return 0 ; +}