Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/CombineSubexons.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/CombineSubexons.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1680 @@ +#include <stdio.h> +#include <string.h> + +#include <algorithm> +#include <vector> +#include <map> +#include <string> + +#include "alignments.hpp" +#include "blocks.hpp" +#include "stats.hpp" +#include "SubexonGraph.hpp" + +char usage[] = "combineSubexons [options]\n" + "Required options:\n" + "\t-s STRING: the path to the predicted subexon information. Can use multiple -s to specify multiple subexon prediction files\n" + "\t\tor\n" + "\t--ls STRING: the path to file of the list of the predicted subexon information.\n" ; + +struct _overhang +{ + int cnt ; // the number of samples support this subexon. + int validCnt ; // The number of samples that are used for compute probability. + int length ; + double classifier ; +} ; + +struct _intronicInfo +{ + int chrId ; + int start, end ; + int leftSubexonIdx, rightSubexonIdx ; + double irClassifier ; + int irCnt ; + int validIrCnt ; + struct _overhang leftOverhang, rightOverhang ; // leftOverhangClassifier is for the overhang subexon at the left side of this intron. +} ; + +struct _seInterval +{ + int chrId ; + int start, end ; + int type ; // 0-subexon, 1-intronicInfo + int idx ; +} ; + +struct _subexonSplit +{ + int chrId ; + int pos ; + int type ; //1-start of a subexon. 2-end of a subexon + int splitType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. + int strand ; + + int weight ; +} ; + +struct _interval // exon or intron +{ + int chrId ; + int start, end ; + int strand ; + int sampleSupport ; +} ; + +struct _subexonSupplement // supplement the subexon structure defined in SubexonGraph. +{ + int *nextSupport ; + int *prevSupport ; +} ; + +char buffer[4096] ; + +bool CompSubexonSplit( struct _subexonSplit a, struct _subexonSplit b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.pos != b.pos ) + return a.pos < b.pos ; + else if ( a.type != b.type ) + { + // the split site with no strand information should come first. + /*if ( a.splitType != b.splitType ) + { + if ( a.splitType == 0 ) + return true ; + else if ( b.splitType == 0 ) + return false ; + }*/ + return a.type < b.type ; + } + else if ( a.splitType != b.splitType ) + { + //return a.splitType < b.splitType ; + if ( a.splitType == 0 ) + return true ; + else if ( b.splitType == 0 ) + return false ; + + if ( a.type == 1 ) + return a.splitType > b.splitType ; + else + return a.splitType < b.splitType ; + } + + return false ; +} + +bool CompInterval( struct _interval a, struct _interval b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.start != b.start ) + return a.start < b.start ; + else if ( a.end != b.end ) + return a.end < b.end ; + return false ; +} + +bool CompSeInterval( struct _seInterval a, struct _seInterval b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.start < b.start ) + return true ; + else if ( a.start > b.start ) + return false ; + else if ( a.end < b.end ) + return true ; + else + return false ; +} + +// Keep this the same as in SubexonInfo.cpp. +double TransformCov( double c ) +{ + double ret ; + //return sqrt( c ) - 1 ; + + if ( c <= 2 + 1e-6 ) + ret = 1e-6 ; + else + ret = c - 2 ; + + return ret ; +} + +double GetUpdateMixtureGammaClassifier( double ratio, double cov, double piRatio, double kRatio[2], double thetaRatio[2], + double piCov, double kCov[2], double thetaCov[2], bool conservative ) +{ + double p1 = 0, p2 ; + + cov = TransformCov( cov ) ; + if ( cov < ( kCov[0] - 1 ) * thetaCov[0] ) + cov = ( kCov[0] - 1 ) * thetaCov[0] ; + + if ( ratio > 0 ) + p1 = MixtureGammaAssignment( ratio, piRatio, kRatio, thetaRatio ) ; + // Make sure cov > 1? + p2 = MixtureGammaAssignment( cov, piCov, kCov, thetaCov ) ; + double ret = 0 ; + + if ( conservative ) + { + if ( p1 >= p2 ) // we should use ratio. + ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) + - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; + else + ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) + - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; + } + else + { + if ( p1 >= p2 ) // we should use ratio. + ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) + - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; + else + ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) + - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; + } + return ret ; +} + +double GetPValueOfGeneEnd( double cov ) +{ + if ( cov <= 2.0 ) + return 1.0 ; + double tmp = 2.0 * ( sqrt( cov ) - log( cov ) ) ; + if ( tmp <= 0 ) + return 1.0 ; + return 2.0 * alnorm( tmp, true ) ; +} + +char StrandNumToSymbol( int strand ) +{ + if ( strand > 0 ) + return '+' ; + else if ( strand < 0 ) + return '-' ; + else + return '.' ; +} + +int StrandSymbolToNum( char c ) +{ + if ( c == '+' ) + return 1 ; + else if ( c == '-' ) + return -1 ; + else + return 0 ; +} + +int *MergePositions( int *old, int ocnt, int *add, int acnt, int &newCnt ) //, int **support ) +{ + int i, j, k ; + int *ret ; + if ( acnt == 0 ) + { + newCnt = ocnt ; + return old ; + } + if ( ocnt == 0 ) + { + newCnt = acnt ; + ret = new int[acnt] ; + //*support = new int[acnt] ; + for ( i = 0 ; i < acnt ; ++i ) + { + //(*support)[i] = 1 ; + ret[i] = add[i] ; + } + return ret ; + } + newCnt = 0 ; + for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) + { + if ( old[i] < add[j] ) + { + ++i ; + ++newCnt ; + } + else if ( old[i] == add[j] ) + { + ++i ; ++j ; + ++newCnt ; + } + else + { + ++j ; + ++newCnt ; + } + } + newCnt = newCnt + ( ocnt - i ) + ( acnt - j ) ; + // no new elements. + if ( newCnt == ocnt ) + { + /*i = 0 ; + for ( j = 0 ; j < acnt ; ++j ) + { + for ( ; old[i] < add[j] ; ++i ) + ; + ++(*support)[i] ; + }*/ + return old ; + } + k = 0 ; + //delete []old ; + ret = new int[ newCnt ] ; + //int *bufferSupport = new int[newCnt] ; + for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) + { + if ( old[i] < add[j] ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] ; + ++i ; + ++k ; + } + else if ( old[i] == add[j] ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] + 1 ; + ++i ; ++j ; + ++k ; + } + else + { + ret[k] = add[j] ; + //bufferSupport[k] = 1 ; + ++j ; + ++k ; + } + } + for ( ; i < ocnt ; ++i, ++k ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] ; + } + for ( ; j < acnt ; ++j, ++k ) + { + ret[k] = add[j] ; + //bufferSupport[k] = 1 ; + } + delete[] old ; + //delete[] *support ; + //*support = bufferSupport ; + return ret ; +} + +void CoalesceSubexonSplits( std::vector<struct _subexonSplit> &splits, int mid ) +{ + int i, j, k ; + int cnt = splits.size() ; + //std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ; + + std::vector<struct _subexonSplit> sortedSplits ; + sortedSplits.resize( cnt ) ; + + k = 0 ; + for ( i = 0, j = mid ; i < mid && j < cnt ; ++k ) + { + if ( CompSubexonSplit( splits[i], splits[j] ) ) + { + sortedSplits[k] = splits[i] ; + ++i ; + } + else + { + sortedSplits[k] = splits[j] ; + ++j ; + } + } + for ( ; i < mid ; ++i, ++k ) + sortedSplits[k] = splits[i] ; + for ( ; j < cnt ; ++j, ++k ) + sortedSplits[k] = splits[j] ; + splits = sortedSplits ; + + k = 0 ; + for ( i = 1 ; i < cnt ; ++i ) + { + if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType + && splits[i].strand == splits[k].strand ) + { + splits[k].weight += splits[i].weight ; + } + else + { + ++k ; + splits[k] = splits[i] ; + } + } + splits.resize( k + 1 ) ; +} + +void CoalesceDifferentStrandSubexonSplits( std::vector<struct _subexonSplit> &splits ) +{ + int i, j, k, l ; + int cnt = splits.size() ; + k = 0 ; + for ( i = 0 ; i < cnt ; ) + { + for ( j = i + 1 ; j < cnt ; ++j ) + { + if ( splits[i].chrId == splits[j].chrId && splits[i].pos == splits[j].pos && splits[i].type == splits[j].type && splits[i].splitType == splits[j].splitType ) + continue ; + break ; + } + + int maxWeight = -1 ; + int weightSum = 0 ; + int strand = splits[i].strand ; + for ( l = i ; l < j ; ++l ) + { + weightSum += splits[l].weight ; + if ( splits[l].strand != 0 && splits[l].weight > maxWeight ) + { + strand = splits[l].strand ; + maxWeight = splits[l].weight ; + } + } + //printf( "%d: %d %d %d\n", splits[i].pos, i, j, strand ) ; + splits[k] = splits[i] ; + splits[k].strand = strand ; + splits[k].weight = weightSum ; + ++k ; + + i = j ; + } + splits.resize( k ) ; +} + + +void CoalesceIntervals( std::vector<struct _interval> &intervals ) +{ + int i, k ; + std::sort( intervals.begin(), intervals.end(), CompInterval ) ; + int cnt = intervals.size() ; + k = 0 ; + for ( i = 1 ; i < cnt ; ++i ) + { + if ( intervals[i].chrId == intervals[k].chrId && intervals[i].start == intervals[k].start && intervals[i].end == intervals[k].end ) + intervals[k].sampleSupport += intervals[i].sampleSupport ; + else + { + ++k ; + intervals[k] = intervals[i] ; + } + } + intervals.resize( k + 1 ) ; +} + +void CleanIntervalIrOverhang( std::vector<struct _interval> &irOverhang ) +{ + int i, j, k ; + std::sort( irOverhang.begin(), irOverhang.end(), CompInterval ) ; + + int cnt = irOverhang.size() ; + for ( i = 0 ; i < cnt ; ++i ) + { + if ( irOverhang[i].start == -1 ) + continue ; + + + // locate the longest interval start at the same coordinate. + int tag = i ; + + for ( j = i + 1 ; j < cnt ; ++j ) + { + if ( irOverhang[j].chrId != irOverhang[i].chrId || irOverhang[j].start != irOverhang[i].start ) + break ; + if ( irOverhang[j].start == -1 ) + continue ; + tag = j ; + } + + for ( k = i ; k < tag ; ++k ) + { + irOverhang[k].start = -1 ; + } + + for ( k = tag + 1 ; k < cnt ; ++k ) + { + if ( irOverhang[k].chrId != irOverhang[tag].chrId || irOverhang[k].start > irOverhang[tag].end ) + break ; + if ( irOverhang[k].end <= irOverhang[tag].end ) + { + irOverhang[k].start = -1 ; + } + } + } + + k = 0 ; + for ( i = 0 ; i < cnt ; ++i ) + { + if ( irOverhang[i].start == -1 ) + continue ; + irOverhang[k] = irOverhang[i] ; + ++k ; + } + irOverhang.resize( k ) ; +} + +// Remove the connection that does not match the boundary +// of subexons. +void CleanUpSubexonConnections( std::vector<struct _subexon> &subexons ) +{ + int seCnt = subexons.size() ; + int i, j, k, m ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].prevCnt > 0 ) + { + for ( k = i ; k >= 0 ; --k ) + if ( subexons[k].chrId != subexons[i].chrId || subexons[k].end <= subexons[i].prev[0] ) + break ; + if ( subexons[k].chrId != subexons[i].chrId ) + ++k ; + m = 0 ; + for ( j = 0 ; j < subexons[i].prevCnt ; ++j ) + { + for ( ; k <= i ; ++k ) + if ( subexons[k].end >= subexons[i].prev[j] ) + break ; + if ( subexons[k].end == subexons[i].prev[j] + && ( SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) || subexons[k].end + 1 == subexons[i].start ) ) + { + subexons[i].prev[m] = subexons[i].prev[j] ; + ++m ; + } + } + subexons[i].prevCnt = m ; + } + + m = 0 ; + k = i ; + for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) + { + for ( ; k < seCnt ; ++k ) + if ( subexons[k].chrId != subexons[i].chrId || subexons[k].start >= subexons[i].next[j] ) + break ; + if ( subexons[k].start == subexons[i].next[j] + && ( SubexonGraph::IsSameStrand( subexons[i].rightStrand, subexons[k].leftStrand ) || subexons[i].end + 1 == subexons[k].start ) ) + { + subexons[i].next[m] = subexons[i].next[j] ; + ++m ; + } + } + subexons[i].nextCnt = m ; + } +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + FILE *fp ; + std::vector<char *> files ; + + Blocks regions ; + Alignments alignments ; + + if ( argc == 1 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-s" ) ) + { + files.push_back( argv[i + 1] ) ; + ++i ; + continue ; + } + else if ( !strcmp( argv[i], "--ls" ) ) + { + FILE *fpLs = fopen( argv[i + 1], "r" ) ; + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + char *fileName = strdup( buffer ) ; + files.push_back( fileName ) ; + } + } + } + int fileCnt = files.size() ; + // Obtain the chromosome ids through bam file. + fp = fopen( files[0], "r" ) ; + if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + int len = strlen( buffer ) ; + buffer[len - 1] = '\0' ; + alignments.Open( buffer + 1 ) ; + } + fclose( fp ) ; + + // Collect the split sites of subexons. + std::vector<struct _subexonSplit> subexonSplits ; + std::vector<struct _interval> intervalIrOverhang ; // intervals contains ir and overhang. + std::vector<struct _interval> introns ; + std::vector<struct _interval> exons ; + + + for ( k = 0 ; k < fileCnt ; ++k ) + { + fp = fopen( files[k], "r" ) ; + struct _subexon se ; + struct _subexonSplit sp ; + char chrName[50] ; + int origSize = subexonSplits.size() ; + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + continue ; + + SubexonGraph::InputSubexon( buffer, alignments, se, false) ; + // Record all the intron rentention, overhang from the samples + if ( ( se.leftType == 2 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) + || ( se.leftType == 0 && se.rightType == 1 ) ) + { + struct _interval si ; + si.chrId = se.chrId ; + si.start = se.start ; + si.end = se.end ; + + intervalIrOverhang.push_back( si ) ; + } + + // Ignore overhang subexons and ir subexons for now. + if ( ( se.leftType == 0 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) + || ( se.leftType == 2 && se.rightType == 1 ) ) + continue ; + + if ( se.leftType == 0 && se.rightType == 0 && ( se.leftClassifier == -1 || se.leftClassifier == 1 ) ) // ignore noisy single-exon island + continue ; + if ( se.leftType == 0 && se.rightType == 0 && ( fileCnt >= 10 && se.leftClassifier > 0.99 ) ) + continue ; + + if ( se.leftType == 1 && se.rightType == 2 ) // a full exon, we allow mixtured strand here. + { + struct _interval ni ; + ni.chrId = se.chrId ; + ni.start = se.start ; + ni.end = se.end ; + ni.strand = se.rightStrand ; + ni.sampleSupport = 1 ; + exons.push_back( ni ) ; + } + + + /*for ( i = 0 ; i < se.nextCnt ; ++i ) + { + struct _interval ni ; + ni.chrId = se.chrId ; + ni.start = se.end ; + ni.end = se.next[i] ; + ni.strand = se.rightStrand ; + ni.sampleSupport = 1 ; + if ( ni.start + 1 < ni.end ) + introns.push_back( ni ) ; + }*/ + + sp.chrId = se.chrId ; + sp.pos = se.start ; + sp.type = 1 ; + sp.splitType = se.leftType ; + sp.strand = se.leftStrand ; + sp.weight = 1 ; + subexonSplits.push_back( sp ) ; + + sp.chrId = se.chrId ; + sp.pos = se.end ; + sp.type = 2 ; + sp.splitType = se.rightType ; + sp.strand = se.rightStrand ; + sp.weight = 1 ; + subexonSplits.push_back( sp ) ; + + /*if ( se.prevCnt > 0 ) + delete[] se.prev ; + if ( se.nextCnt > 0 ) + delete[] se.next ;*/ + } + CoalesceIntervals( exons ) ; + CoalesceIntervals( introns ) ; + CoalesceSubexonSplits( subexonSplits, origSize ) ; + CleanIntervalIrOverhang( intervalIrOverhang ) ; + + fclose( fp ) ; + } + + CoalesceDifferentStrandSubexonSplits( subexonSplits ) ; + + // Obtain the split sites from the introns. + int intronCnt = introns.size() ; + std::vector<struct _subexonSplit> intronSplits ; + for ( i = 0 ; i < intronCnt ; ++i ) + { + /*if ( introns[i].sampleSupport < 0.05 * fileCnt ) + { + continue ; + }*/ + struct _interval &it = introns[i] ; + struct _subexonSplit sp ; + sp.chrId = it.chrId ; + sp.pos = it.start ; + sp.type = 2 ; + sp.splitType = 2 ; + sp.strand = it.strand ; + intronSplits.push_back( sp ) ; + + sp.chrId = it.chrId ; + sp.pos = it.end ; + sp.type = 1 ; + sp.splitType = 1 ; + sp.strand = it.strand ; + intronSplits.push_back( sp ) ; + } + + // Pair up the split sites to get subexons + std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ; + //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ; + + // Convert the hard boundary to soft boundary if the split sites is filtered from the introns + // Seems NO need to do this now. + int splitCnt = subexonSplits.size() ; + int intronSplitCnt = intronSplits.size() ; + k = 0 ; + //for ( i = 0 ; i < splitCnt ; ++i ) + while ( 0 ) + { + if ( subexonSplits[i].type != subexonSplits[i].splitType ) + continue ; + + while ( k < intronSplitCnt && ( intronSplits[k].chrId < subexonSplits[i].chrId + || ( intronSplits[k].chrId == subexonSplits[i].chrId && intronSplits[k].pos < subexonSplits[i].pos ) ) ) + ++k ; + j = k ; + while ( j < intronSplitCnt && intronSplits[j].chrId == subexonSplits[i].chrId + && intronSplits[j].pos == subexonSplits[i].pos && intronSplits[j].splitType != subexonSplits[i].splitType ) + ++j ; + + // the split site is filtered. + if ( j >= intronSplitCnt || intronSplits[j].chrId != subexonSplits[i].chrId || + intronSplits[j].pos > subexonSplits[i].pos ) + { + //printf( "%d %d. %d %d\n", subexonSplits[i].pos, intronSplits[j].pos, intronSplits[j].chrId , subexonSplits[i].chrId ) ; + subexonSplits[i].splitType = 0 ; + + // Convert the adjacent subexon split. + for ( int l = i + 1 ; i < splitCnt && subexonSplits[l].chrId == subexonSplits[i].chrId + && subexonSplits[l].pos == subexonSplits[i].pos + 1 ; ++l ) + { + if ( subexonSplits[l].type != subexonSplits[i].type + && subexonSplits[l].splitType == subexonSplits[i].splitType ) + { + subexonSplits[l].splitType = 0 ; + } + } + + // And the other direction + for ( int l = i - 1 ; l >= 0 && subexonSplits[l].chrId == subexonSplits[i].chrId + && subexonSplits[l].pos == subexonSplits[i].pos - 1 ; --l ) + { + if ( subexonSplits[l].type != subexonSplits[i].type + && subexonSplits[l].splitType == subexonSplits[i].splitType ) + { + subexonSplits[l].splitType = 0 ; + } + } + } + } + intronSplits.clear() ; + std::vector<struct _subexonSplit>().swap( intronSplits ) ; + + // Force the soft boundary that collides with hard boundaries to be hard boundary. + for ( i = 0 ; i < splitCnt ; ++i ) + { + if ( subexonSplits[i].splitType != 0 ) + continue ; + int newSplitType = 0 ; + int newStrand = subexonSplits[i].strand ; + for ( j = i + 1 ; j < splitCnt ; ++j ) + { + if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || + subexonSplits[i].chrId != subexonSplits[j].chrId ) + break ; + if ( subexonSplits[j].splitType != 0 ) + { + newSplitType = subexonSplits[j].splitType ; + newStrand = subexonSplits[j].strand ; + break ; + } + } + + if ( newSplitType == 0 ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || + subexonSplits[i].chrId != subexonSplits[j].chrId ) + break ; + if ( subexonSplits[j].splitType != 0 ) + { + newSplitType = subexonSplits[j].splitType ; + newStrand = subexonSplits[j].strand ; + break ; + } + } + + } + /*if ( subexonSplits[i].pos == 154464157 ) + { + printf( "force conversion: %d %d %d. %d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ; + }*/ + subexonSplits[i].splitType = newSplitType ; + subexonSplits[i].strand = newStrand ; + } + + /*for ( i = 0 ; i < splitCnt ; ++i ) + { + printf( "%d: type=%d splitType=%d weight=%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, subexonSplits[i].weight ) ; + }*/ + + // Build subexons from the collected split sites. + + std::vector<struct _subexon> subexons ; + int diffCnt = 0 ; // |start of subexon split| - |end of subexon split| + int seCnt = 0 ; + for ( i = 0 ; i < splitCnt - 1 ; ++i ) + { + struct _subexon se ; + /*if ( subexonSplits[i + 1].pos == 144177260 ) + { + printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; + }*/ + + if ( subexonSplits[i].type == 1 ) + diffCnt += subexonSplits[i].weight ; + else + diffCnt -= subexonSplits[i].weight ; + + if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId ) + { + diffCnt = 0 ; + continue ; + } + + if ( diffCnt == 0 ) // the interval between subexon + continue ; + + se.chrId = subexonSplits[i].chrId ; + se.start = subexonSplits[i].pos ; + se.leftType = subexonSplits[i].splitType ; + se.leftStrand = subexonSplits[i].strand ; + if ( subexonSplits[i].type == 2 ) + { + se.leftStrand = 0 ; + ++se.start ; + } + + se.end = subexonSplits[i + 1].pos ; + se.rightType = subexonSplits[i + 1].splitType ; + se.rightStrand = subexonSplits[i + 1].strand ; + if ( subexonSplits[i + 1].type == 1 ) + { + se.rightStrand = 0 ; + --se.end ; + } + + /*if ( se.end == 24613649 ) + { + //printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; + printf( "%d %d %d. %d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ; + }*/ + + if ( se.start > se.end ) //Note: this handles the case of repeated subexon split. + { + // handle the case in sample 0: [...[..] + // in sample 1: [..]...] + if ( seCnt > 0 && se.end == subexons[seCnt - 1].end && subexons[seCnt - 1].rightType < se.rightType ) + { + subexons[seCnt - 1].rightType = se.rightType ; + subexons[seCnt - 1].rightStrand = se.rightStrand ; + } + continue ; + } + se.leftClassifier = se.rightClassifier = 0 ; + se.lcCnt = se.rcCnt = 0 ; + + /*if ( 1 ) //se.chrId == 25 ) + { + printf( "%d: %d-%d: %d %d\n", se.chrId, se.start, se.end, se.leftType, se.rightType ) ; + }*/ + + + se.next = se.prev = NULL ; + se.nextCnt = se.prevCnt = 0 ; + subexons.push_back( se ) ; + ++seCnt ; + } + subexonSplits.clear() ; + std::vector<struct _subexonSplit>().swap( subexonSplits ) ; + + // Adjust the split type. + seCnt = subexons.size() ; + for ( i = 1 ; i < seCnt ; ++i ) + { + if ( subexons[i - 1].end + 1 == subexons[i].start ) + { + if ( subexons[i - 1].rightType == 0 ) + subexons[i - 1].rightType = subexons[i].leftType ; + if ( subexons[i].leftType == 0 ) + subexons[i].leftType = subexons[i - 1].rightType ; + } + } + + // Merge the adjacent soft boundaries + std::vector<struct _subexon> rawSubexons = subexons ; + int exonCnt = exons.size() ; + subexons.clear() ; + + k = 0 ; // hold index for exon. + for ( i = 0 ; i < seCnt ; ) + { + /*if ( rawSubexons[k].rightType == 0 && rawSubexons[i].leftType == 0 + && rawSubexons[k].end + 1 == rawSubexons[i].start ) + { + rawSubexons[k].end = rawSubexons[i].end ; + rawSubexons[k].rightType = rawSubexons[i].rightType ; + rawSubexons[k].rightStrand = rawSubexons[i].rightStrand ; + } + else + { + subexons.push_back( rawSubexons[k] ) ; + k = i ; + }*/ + + while ( k < exonCnt && ( exons[k].chrId < rawSubexons[i].chrId + || ( exons[k].chrId == rawSubexons[i].chrId && exons[k].start < rawSubexons[i].start ) ) ) + ++k ; + + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( rawSubexons[j - 1].chrId != rawSubexons[j].chrId || rawSubexons[j - 1].rightType != 0 || rawSubexons[j].leftType != 0 + || ( fileCnt > 1 && rawSubexons[j - 1].end + 1 != rawSubexons[j].start ) + || ( fileCnt == 1 && rawSubexons[j - 1].end + 50 < rawSubexons[j].start ) ) + break ; + } + // rawsubexons[i...j-1] will be merged. + + /*if ( rawSubexons[i].start == 119625875 ) + { + printf( "merge j-1: %d %d %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType, + rawSubexons[j].start, rawSubexons[j].leftType ) ; + }*/ + bool merge = true ; + if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1 + && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) + { + merge = false ; + int sampleSupport = 0 ; + for ( int l = k ; l < exonCnt ; ++l ) + { + if ( exons[l].chrId != rawSubexons[i].chrId || exons[l].start > rawSubexons[i].start ) + break ; + if ( exons[l].end == rawSubexons[j - 1].end ) + { + merge = true ; + sampleSupport = exons[l].sampleSupport ; + break ; + } + } + + if ( merge == true && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) + { + if ( sampleSupport <= 0.2 * fileCnt ) + { + merge = false ; + } + } + + if ( merge == false ) + { + if ( j - i >= 3 ) + { + rawSubexons[i].end = rawSubexons[ (i + j - 1) / 2 ].start ; + rawSubexons[j - 1].start = rawSubexons[ (i + j - 1) / 2].end ; + } + + if ( rawSubexons[i].end + 1 == rawSubexons[j - 1].start ) + { + --rawSubexons[i].end ; + ++rawSubexons[j - 1].start ; + } + subexons.push_back( rawSubexons[i] ) ; + subexons.push_back( rawSubexons[j - 1] ) ; + } + } + + if ( merge ) + { + rawSubexons[i].end = rawSubexons[j - 1].end ; + rawSubexons[i].rightType = rawSubexons[j - 1].rightType ; + rawSubexons[i].rightStrand = rawSubexons[j - 1].rightStrand ; + + if ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType != 0 ) + { + rawSubexons[i].start = rawSubexons[ ( i + j - 1 ) / 2 ].start ; + } + else if ( rawSubexons[i].rightType == 0 && rawSubexons[i].leftType != 0 ) + { + rawSubexons[i].end = rawSubexons[ ( i + j - 1 ) / 2 ].end ; + } + + subexons.push_back( rawSubexons[i] ) ; + } + + i = j ; + } + exons.clear() ; + std::vector<struct _interval>().swap( exons ) ; + + // Remove overhang, ir subexons intron created after putting multiple sample to gether. + // eg: s0: [......) + // s1: [...]--------[....] + // s2: [...]..)-----[....] + // Though the overhang from s2 is filtered in readin, there will a new overhang created combining s0,s1. + // But be careful about how to compute the classifier for the overhang part contributed from s0. + // Furthermore, note that the case of single-exon island showed up in intron retention region after combining is not possible when get here. + // eg: s0:[...]-----[...] + // s1: (.) + // s2:[.............] + // After merge adjacent soft boundaries, the single-exon island will disappear. + rawSubexons = subexons ; + seCnt = subexons.size() ; + subexons.clear() ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 1 ) // ir + || ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 0 ) // overhang + || ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType == 1 ) ) + continue ; + subexons.push_back( rawSubexons[i] ) ; + } + + // Remove the single-exon island if it overlaps with intron retentioned or overhang. + rawSubexons = subexons ; + seCnt = subexons.size() ; + subexons.clear() ; + k = 0 ; + std::sort( intervalIrOverhang.begin(), intervalIrOverhang.end(), CompInterval ) ; + int irOverhangCnt = intervalIrOverhang.size() ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( rawSubexons[i].leftType != 0 || rawSubexons[i].rightType != 0 ) + { + subexons.push_back( rawSubexons[i] ) ; + continue ; + } + + while ( k < irOverhangCnt ) + { + // Locate the interval that before the island + if ( intervalIrOverhang[k].chrId < rawSubexons[i].chrId + || ( intervalIrOverhang[k].chrId == rawSubexons[i].chrId && intervalIrOverhang[k].end < rawSubexons[i].start ) ) + { + ++k ; + continue ; + } + break ; + } + bool overlap = false ; + for ( j = k ; j < irOverhangCnt ; ++j ) + { + if ( intervalIrOverhang[j].chrId > rawSubexons[i].chrId || intervalIrOverhang[j].start > rawSubexons[i].end ) + break ; + if ( ( intervalIrOverhang[j].start <= rawSubexons[i].start && intervalIrOverhang[j].end >= rawSubexons[i].start ) + || ( intervalIrOverhang[j].start <= rawSubexons[i].end && intervalIrOverhang[j].end >= rawSubexons[i].end ) ) + { + overlap = true ; + break ; + } + } + + if ( !overlap ) + subexons.push_back( rawSubexons[i] ) ; + } + rawSubexons.clear() ; + std::vector<struct _subexon>().swap( rawSubexons ) ; + + intervalIrOverhang.clear() ; + std::vector<struct _interval>().swap( intervalIrOverhang ) ; + + // Create the dummy intervals. + seCnt = subexons.size() ; + std::vector<struct _intronicInfo> intronicInfos ; + std::vector<struct _seInterval> seIntervals ; + std::vector<struct _subexonSupplement> subexonInfo ; + + //subexonInfo.resize( seCnt ) ; + for ( i = 0 ; i < seCnt ; ++i ) + { + struct _seInterval ni ; // new interval + ni.start = subexons[i].start ; + ni.end = subexons[i].end ; + ni.type = 0 ; + ni.idx = i ; + ni.chrId = subexons[i].chrId ; + seIntervals.push_back( ni ) ; + + /*if ( subexons[i].end == 42671717 ) + { + printf( "%d: %d-%d: %d\n", subexons[i].chrId, subexons[i].start, subexons[i].end, subexons[i].rightType ) ; + }*/ + //subexonInfo[i].prevSupport = subexonInfo[i].nextSupport = NULL ; + + /*int nexti ; + for ( nexti = i + 1 ; nexti < seCnt ; ++nexti ) + if ( subexons[ nexti ].leftType == 0 && subexons[nexti].rightType == 0 )*/ + + if ( i < seCnt - 1 && subexons[i].chrId == subexons[i + 1].chrId && + subexons[i].end + 1 < subexons[i + 1].start && + subexons[i].rightType + subexons[i + 1].leftType != 0 ) + { + // Only consider the intervals like ]..[,]...(, )...[ + // The case like ]...] is actaully things like ][...] in subexon perspective, + // so they won't pass the if-statement + struct _intronicInfo nii ; // new intronic info + ni.start = subexons[i].end + 1 ; + ni.end = subexons[i + 1].start - 1 ; + ni.type = 1 ; + ni.idx = intronicInfos.size() ; + seIntervals.push_back( ni ) ; + + nii.chrId = subexons[i].chrId ; + nii.start = ni.start ; + nii.end = ni.end ; + nii.leftSubexonIdx = i ; + nii.rightSubexonIdx = i + 1 ; + nii.irClassifier = 0 ; + nii.irCnt = 0 ; + nii.validIrCnt = 0 ; + nii.leftOverhang.cnt = 0 ; + nii.leftOverhang.validCnt = 0 ; + nii.leftOverhang.length = 0 ; + nii.leftOverhang.classifier = 0 ; + nii.rightOverhang.cnt = 0 ; + nii.rightOverhang.validCnt = 0 ; + nii.rightOverhang.length = 0 ; + nii.rightOverhang.classifier = 0 ; + intronicInfos.push_back( nii ) ; + /*if ( nii.end == 23667 ) + { + printf( "%d %d. %d (%d %d %d)\n", nii.start, nii.end, subexons[i].rightType, subexons[i+1].start, subexons[i + 1].end, subexons[i + 1].leftType ) ; + }*/ + } + } + + // Go through all the files to get some statistics number + double avgIrPiRatio = 0 ; + double avgIrPiCov = 0 ; + double irPiRatio, irKRatio[2], irThetaRatio[2] ; // Some statistical results + double irPiCov, irKCov[2], irThetaCov[2] ; + + double avgOverhangPiRatio = 0 ; + double avgOverhangPiCov = 0 ; + double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; // Some statistical results + double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; + + for ( k = 0 ; k < fileCnt ; ++k ) + { + fp = fopen( files[k], "r" ) ; + + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + { + char buffer2[100] ; + sscanf( buffer, "%s", buffer2 ) ; + if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) + { + // TODO: ignore certain samples if the coverage seems wrong. + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], + buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; + avgIrPiRatio += irPiRatio ; + } + else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) + { + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], + buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; + avgOverhangPiRatio += overhangPiRatio ; + } + } + else + break ; + } + fclose( fp ) ; + } + avgIrPiRatio /= fileCnt ; + avgOverhangPiRatio /= fileCnt ; + + // Go through all the files to put statistical results into each subexon. + std::vector< struct _subexon > sampleSubexons ; + int subexonCnt = subexons.size() ; + for ( k = 0 ; k < fileCnt ; ++k ) + { + //if ( k == 220 ) + // exit( 1 ) ; + fp = fopen( files[k], "r" ) ; + struct _subexon se ; + struct _subexonSplit sp ; + char chrName[50] ; + + sampleSubexons.clear() ; + + int tag = 0 ; + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + { + char buffer2[200] ; + sscanf( buffer, "%s", buffer2 ) ; + if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], + buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiCov, buffer2, &irKCov[0], buffer2, &irThetaCov[0], + buffer2, &irKCov[1], buffer2, &irThetaCov[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], + buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_cov:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiCov, buffer2, &overhangKCov[0], buffer2, &overhangThetaCov[0], + buffer2, &overhangKCov[1], buffer2, &overhangThetaCov[1] ) ; + } + continue ; + } + else + break ; + + //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + //sampleSubexons.push_back( se ) ; + } + + //int sampleSubexonCnt = sampleSubexons.size() ; + int intervalCnt = seIntervals.size() ; + //for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + int iterCnt = 0 ; + while ( 1 ) + { + if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL) + break ; + ++iterCnt ; + + struct _subexon se ; + SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + + while ( tag < intervalCnt ) + { + if ( seIntervals[tag].chrId < se.chrId || + ( seIntervals[tag].chrId == se.chrId && seIntervals[tag].end < se.start ) ) + { + ++tag ; + continue ; + } + else + break ; + } + + for ( j = tag ; j < intervalCnt ; ++j ) + { + if ( seIntervals[j].start > se.end || seIntervals[j].chrId > se.chrId ) // terminate if no overlap. + break ; + int idx ; + + if ( seIntervals[j].type == 0 ) + { + idx = seIntervals[j].idx ; + if ( subexons[idx].leftType == 1 && se.leftType == 1 && subexons[idx].start == se.start ) + { + double tmp = se.leftClassifier ; + if ( se.leftClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; + ++subexons[idx].lcCnt ; + subexons[idx].prev = MergePositions( subexons[idx].prev, subexons[idx].prevCnt, + se.prev, se.prevCnt, subexons[idx].prevCnt ) ; + + if ( se.rightType == 0 ) // a gene end here + { + for ( int l = idx ; l < subexonCnt ; ++l ) + { + if ( l > idx && ( subexons[l].end > subexons[l - 1].start + 1 + || subexons[l].chrId != subexons[l - 1].chrId ) ) + break ; + if ( subexons[l].rightType == 2 ) + { + double adjustAvgDepth = se.avgDepth ; + if ( se.end - se.start + 1 >= 100 ) + adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; + else + adjustAvgDepth *= 2 ; + double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; + //if ( se.end - se.start + 1 >= 500 && p > 0.001 ) + // p = 0.001 ; + + subexons[l].rightClassifier -= 2.0 * log( p ) ; + ++subexons[l].rcCnt ; + break ; + } + } + } + } + //if ( se.chrId == 25 ) + // printf( "(%d %d %d. %d) (%d %d %d. %d)\n", se.chrId, se.start, se.end, se.rightType, subexons[idx].chrId, subexons[idx].start, subexons[idx].end, subexons[idx].rightType ) ; + if ( subexons[idx].rightType == 2 && se.rightType == 2 && subexons[idx].end == se.end ) + { + double tmp = se.rightClassifier ; + if ( se.rightClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].rightClassifier -= 2.0 * log( tmp ) ; + ++subexons[idx].rcCnt ; + + subexons[idx].next = MergePositions( subexons[idx].next, subexons[idx].nextCnt, + se.next, se.nextCnt, subexons[idx].nextCnt ) ; + + if ( se.leftType == 0 ) + { + for ( int l = idx ; l >= 0 ; --l ) + { + if ( l < idx && ( subexons[l].end < subexons[l + 1].start - 1 + || subexons[l].chrId != subexons[l + 1].chrId ) ) + break ; + if ( subexons[l].leftType == 1 ) + { + double adjustAvgDepth = se.avgDepth ; + if ( se.end - se.start + 1 >= 100 ) + adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; + else + adjustAvgDepth *= 2 ; + double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; + //if ( se.end - se.start + 1 >= 500 && p >= 0.001 ) + // p = 0.001 ; + subexons[l].leftClassifier -= 2.0 * log( p ) ; + ++subexons[l].lcCnt ; + break ; + } + } + } + } + + if ( subexons[idx].leftType == 0 && subexons[idx].rightType == 0 + && se.leftType == 0 && se.rightType == 0 ) // the single-exon island. + { + double tmp = se.leftClassifier ; + if ( se.leftClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; + subexons[idx].rightClassifier = subexons[idx].leftClassifier ; + ++subexons[idx].lcCnt ; + ++subexons[idx].rcCnt ; + } + } + else if ( seIntervals[j].type == 1 ) + { + idx = seIntervals[j].idx ; + // Overlap on the left part of intron + if ( se.start <= intronicInfos[idx].start && se.end < intronicInfos[idx].end + && subexons[ intronicInfos[idx].leftSubexonIdx ].rightType != 0 ) + { + int len = se.end - intronicInfos[idx].start + 1 ; + intronicInfos[idx].leftOverhang.length += len ; + ++intronicInfos[idx].leftOverhang.cnt ; + + // Note that the sample subexon must have a soft boundary at right hand side, + // otherwise, this part is not an intron and won't show up in intronic Info. + if ( se.leftType == 2 ) + { + if ( se.leftRatio > 0 && se.avgDepth > 1 ) + { + ++intronicInfos[idx].leftOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( se.leftRatio, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, false ) ; + intronicInfos[idx].leftOverhang.classifier += update ; + } + } + else if ( se.leftType == 1 ) + { + ++intronicInfos[idx].leftOverhang.validCnt ; + double update = GetUpdateMixtureGammaClassifier( 1.0, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, true ) ; + intronicInfos[idx].leftOverhang.classifier += update ; + + int seIdx = intronicInfos[idx].leftSubexonIdx ; + subexons[seIdx].rightClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; + ++subexons[ seIdx ].rcCnt ; + } + // ignore the contribution of single-exon island here? + } + // Overlap on the right part of intron + else if ( se.start > intronicInfos[idx].start && se.end >= intronicInfos[idx].end + && subexons[ intronicInfos[idx].rightSubexonIdx ].leftType != 0 ) + { + int len = intronicInfos[idx].end - se.start + 1 ; + intronicInfos[idx].rightOverhang.length += len ; + ++intronicInfos[idx].rightOverhang.cnt ; + + // Note that the sample subexon must have a soft boundary at left hand side, + // otherwise, this won't show up in intronic Info + if ( se.rightType == 1 ) + { + if ( se.rightRatio > 0 && se.avgDepth > 1 ) + { + ++intronicInfos[idx].rightOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( se.rightRatio, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, false ) ; + intronicInfos[idx].rightOverhang.classifier += update ; + } + } + else if ( se.rightType == 2 ) + { + ++intronicInfos[idx].rightOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( 1, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, true ) ; + intronicInfos[idx].rightOverhang.classifier += update ; + + int seIdx = intronicInfos[idx].rightSubexonIdx ; + /*if ( subexons[ seIdx ].start == 6873648 ) + { + printf( "%lf %lf: %lf %lf %lf\n", subexons[seIdx].leftClassifier, GetPValueOfGeneEnd( se.avgDepth ), se.avgDepth, sqrt( se.avgDepth ), log( se.avgDepth ) ) ; + }*/ + subexons[seIdx].leftClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; + ++subexons[ seIdx ].lcCnt ; + } + } + // Intron is fully contained in this sample subexon, then it is a ir candidate + else if ( se.start <= intronicInfos[idx].start && se.end >= intronicInfos[idx].end ) + { + if ( se.leftType == 2 && se.rightType == 1 ) + { + double ratio = regions.PickLeftAndRightRatio( se.leftRatio, se.rightRatio ) ; + ++intronicInfos[idx].irCnt ; + if ( ratio > 0 && se.avgDepth > 1 ) + { + double update = GetUpdateMixtureGammaClassifier( ratio, se.avgDepth, + irPiRatio, irKRatio, irThetaRatio, + irPiCov, irKCov, irThetaCov, true ) ; + //if ( intronicInfos[idx].start == 37617368 ) + // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; + intronicInfos[idx].irClassifier += update ; + ++intronicInfos[idx].validIrCnt ; + } + } + else if ( se.leftType == 1 || se.rightType == 2 ) + { + //intronicInfos[idx].irClassifier += LogGammaDensity( 4.0, irKRatio[1], irThetaRatio[1] ) + // - LogGammaDensity( 4.0, irKRatio[0], irThetaRatio[0] ) ; + /*if ( se.start == 37617368 ) + { + printf( "%lf: %lf %lf\n", se.avgDepth, MixtureGammaAssignment( ( irKCov[0] - 1 ) * irThetaCov[0], irPiRatio, irKCov, irThetaCov ), + MixtureGammaAssignment( TransformCov( 4.0 ), irPiRatio, irKCov, irThetaCov ) ) ; + }*/ + if ( se.avgDepth > 1 ) + { + // let the depth be the threshold to determine. + double update = GetUpdateMixtureGammaClassifier( 4.0, se.avgDepth, + irPiRatio, irKRatio, irThetaRatio, + irPiCov, irKCov, irThetaCov, true ) ; + //if ( intronicInfos[idx].start == 36266630 ) + // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; + intronicInfos[idx].irClassifier += update ; + ++intronicInfos[idx].irCnt ; + ++intronicInfos[idx].validIrCnt ; + } + } + else + { + // the intron is contained in a overhang subexon from the sample or single-exon island + } + } + // sample subexon is contained in the intron. + else + { + // Do nothing. + } + } + } + + //if ( se.nextCnt > 0 ) + delete[] se.next ; + //if ( se.prevCnt > 0 ) + delete[] se.prev ; + } + fclose( fp ) ; + + /*for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + { + if ( sampleSubexons[i].nextCnt > 0 ) + delete[] sampleSubexons[i].next ; + if ( sampleSubexons[i].prevCnt > 0 ) + delete[] sampleSubexons[i].prev ; + }*/ + } + + CleanUpSubexonConnections( subexons ) ; + + // Convert the temporary statistics number into formal statistics result. + for ( i = 0 ; i < subexonCnt ; ++i ) + { + struct _subexon &se = subexons[i] ; + if ( se.leftType == 0 && se.rightType == 0 ) // single-exon txpt. + { + se.leftClassifier = se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; + } + else + { + if ( se.leftType == 1 ) + { + se.leftClassifier = 1 - chicdf( se.leftClassifier, 2 * se.lcCnt ) ; + } + else + se.leftClassifier = -1 ; + + if ( se.rightType == 2 ) + se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; + else + se.rightClassifier = -1 ; + } + } + + int iiCnt = intronicInfos.size() ; //intronicInfo count + for ( i = 0 ; i < iiCnt ; ++i ) + { + struct _intronicInfo &ii = intronicInfos[i] ; + if ( ii.validIrCnt > 0 ) + { + for ( j = 0 ; j < fileCnt - ii.validIrCnt ; ++j ) + { + ii.irClassifier -= log( 10.0 ) ; + } + /*if ( ii.validIrCnt < fileCnt * 0.15 ) + ii.irClassifier -= log( 1000.0 ) ; + else if ( ii.validIrCnt < fileCnt * 0.5 ) + ii.irClassifier -= log( 100.0 ) ;*/ + ii.irClassifier = (double)1.0 / ( 1.0 + exp( ii.irClassifier + log( 1 - avgIrPiRatio ) - log( avgIrPiRatio ) ) ) ; + } + else + ii.irClassifier = -1 ; + + if ( ii.leftOverhang.validCnt > 0 ) + ii.leftOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.leftOverhang.classifier + + log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; + else + ii.leftOverhang.classifier = -1 ; + + if ( ii.rightOverhang.validCnt > 0 ) + ii.rightOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.rightOverhang.classifier + + log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; + else + ii.rightOverhang.classifier = -1 ; + } + + // Change the classifier for the hard boundaries if its adjacent intron has intron retention classifier + // which collide with overhang subexon. + int intervalCnt = seIntervals.size() ; + for ( i = 0 ; i < intervalCnt ; ++i ) + { + if ( seIntervals[i].type == 1 && intronicInfos[ seIntervals[i].idx ].irCnt > 0 ) + { + int idx = seIntervals[i].idx ; + if ( intronicInfos[idx].leftOverhang.cnt > 0 ) + { + int k = seIntervals[i - 1].idx ; + // Should aim for more conservative? + if ( subexons[k].rightClassifier > intronicInfos[idx].leftOverhang.classifier ) + subexons[k].rightClassifier = intronicInfos[idx].leftOverhang.classifier ; + } + + if ( intronicInfos[idx].rightOverhang.cnt > 0 ) + { + int k = seIntervals[i + 1].idx ; + if ( subexons[k].leftClassifier > intronicInfos[idx].rightOverhang.classifier ) + subexons[k].leftClassifier = intronicInfos[idx].rightOverhang.classifier ; + } + } + } + + // Output the result. + for ( i = 0 ; i < intervalCnt ; ++i ) + { + if ( seIntervals[i].type == 0 ) + { + struct _subexon &se = subexons[ seIntervals[i].idx ] ; + + char ls, rs ; + + ls = StrandNumToSymbol( se.leftStrand ) ; + rs = StrandNumToSymbol( se.rightStrand ) ; + + printf( "%s %d %d %d %d %c %c -1 -1 -1 %lf %lf ", alignments.GetChromName( se.chrId ), se.start, se.end, + se.leftType, se.rightType, ls, rs, se.leftClassifier, se.rightClassifier ) ; + if ( i > 0 && seIntervals[i - 1].chrId == seIntervals[i].chrId + && seIntervals[i - 1].end + 1 == seIntervals[i].start + && !( seIntervals[i - 1].type == 0 && + subexons[ seIntervals[i - 1].idx ].rightType != se.leftType ) + && !( seIntervals[i - 1].type == 1 && intronicInfos[ seIntervals[i - 1].idx ].irCnt == 0 + && intronicInfos[ seIntervals[i - 1].idx ].rightOverhang.cnt == 0 ) + && ( se.prevCnt == 0 || se.start - 1 != se.prev[ se.prevCnt - 1 ] ) ) // The connection showed up in the subexon file. + { + printf( "%d ", se.prevCnt + 1 ) ; + for ( j = 0 ; j < se.prevCnt ; ++j ) + printf( "%d ", se.prev[j] ) ; + printf( "%d ", se.start - 1 ) ; + } + else + { + printf( "%d ", se.prevCnt ) ; + for ( j = 0 ; j < se.prevCnt ; ++j ) + printf( "%d ", se.prev[j] ) ; + } + + if ( i < intervalCnt - 1 && seIntervals[i].chrId == seIntervals[i + 1].chrId + && seIntervals[i].end == seIntervals[i + 1].start - 1 + && !( seIntervals[i + 1].type == 0 && + subexons[ seIntervals[i + 1].idx ].leftType != se.rightType ) + && !( seIntervals[i + 1].type == 1 && intronicInfos[ seIntervals[i + 1].idx ].irCnt == 0 + && intronicInfos[ seIntervals[i + 1].idx ].leftOverhang.cnt == 0 ) + && ( se.nextCnt == 0 || se.end + 1 != se.next[0] ) ) + { + printf( "%d %d ", se.nextCnt + 1, se.end + 1 ) ; + } + else + printf( "%d ", se.nextCnt ) ; + for ( j = 0 ; j < se.nextCnt ; ++j ) + printf( "%d ", se.next[j] ) ; + printf( "\n" ) ; + } + else if ( seIntervals[i].type == 1 ) + { + struct _intronicInfo &ii = intronicInfos[ seIntervals[i].idx ] ; + if ( ii.irCnt > 0 ) + { + printf( "%s %d %d 2 1 . . -1 -1 -1 %lf %lf 1 %d 1 %d\n", + alignments.GetChromName( ii.chrId ), ii.start, ii.end, + ii.irClassifier, ii.irClassifier, + seIntervals[i - 1].end, seIntervals[i + 1].start ) ; + } + else + { + // left overhang. + if ( ii.leftOverhang.cnt > 0 ) + { + printf( "%s %d %d 2 0 . . -1 -1 -1 %lf %lf 1 %d 0\n", + alignments.GetChromName( ii.chrId ), ii.start, + ii.start + ( ii.leftOverhang.length / ii.leftOverhang.cnt ) - 1, + ii.leftOverhang.classifier, ii.leftOverhang.classifier, + ii.start - 1 ) ; + } + + // right overhang. + if ( ii.rightOverhang.cnt > 0 ) + { + printf( "%s %d %d 0 1 . . -1 -1 -1 %lf %lf 0 1 %d\n", + alignments.GetChromName( ii.chrId ), + ii.end - ( ii.rightOverhang.length / ii.rightOverhang.cnt ) + 1, ii.end, + ii.rightOverhang.classifier, ii.rightOverhang.classifier, + ii.end + 1 ) ; + } + + } + } + } + + return 0 ; +} +