Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/SubexonInfo.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonInfo.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1214 @@ +// report the total depth for the segment +// Format: ./a.out alignments.bam splice_site +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#define __STDC_FORMAT_MACROS +#include <inttypes.h> + +#include <algorithm> +#include <vector> +#include <math.h> + +#include "alignments.hpp" +#include "blocks.hpp" +#include "stats.hpp" + +#define ABS(x) ((x)<0?-(x):(x)) + +char usage[] = "./subexon-info alignment.bam intron.splice [options]\n" + "options:\n" + "\t--minDepth INT: the minimum coverage depth considered as part of a subexon (default: 2)\n" + "\t--noStats: do not compute the statistical scores (default: not used)\n" ; +char buffer[4096] ; + +int gMinDepth ; + +bool CompSplitSite( struct _splitSite a, struct _splitSite b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.pos != b.pos ) + return a.pos < b.pos ; + else if ( a.type != b.type ) + return a.type < b.type ; // We want the start of exons comes first, since we are scanning the genome from left to right, + // so we can terminate the extension early and create a single-base subexon later. + else + return a.oppositePos < b.oppositePos ; +} + +bool CompBlocksByAvgDepth( struct _block a, struct _block b ) +{ + double avgA = a.depthSum / (double)( a.end - a.start + 1 ) ; + double avgB = b.depthSum / (double)( b.end - b.start + 1 ) ; + return avgA < avgB ; +} + +bool CompBlocksByRatio( struct _block a, struct _block b ) +{ + return a.ratio < b.ratio ; +} + +int CompDouble( const void *p1, const void *p2 ) +{ + double a = *(double *)p1 ; + double b = *(double *)p2 ; + if ( a > b ) + return 1 ; + else if ( a < b ) + return -1 ; + else + return 0 ; +} + +// Clean up the split sites; +void FilterAndSortSplitSites( std::vector<struct _splitSite> &sites ) +{ + std::sort( sites.begin(), sites.end(), CompSplitSite ) ; + int i, j, k, l ; + int size = sites.size() ; + + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + if ( sites[j].chrId != sites[i].chrId || sites[j].pos != sites[i].pos || sites[j].type != sites[i].type ) + break ; + int maxSupport = 0 ; + for ( k = i ; k < j ; ++k ) + if ( sites[k].support > maxSupport ) + maxSupport = sites[k].support ; + + int strandCnt[2] = {0, 0} ; + char strand = sites[i].strand ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + strandCnt[0] += sites[k].support ; + else if ( sites[k].strand == '+' ) + strandCnt[1] += sites[k].support ; + + } + if ( strandCnt[0] > strandCnt[1] ) + strand = '-' ; + else if ( strandCnt[1] > strandCnt[0] ) + strand = '+' ; + + bool allOneExceptMax = false ; + if ( maxSupport >= 20 ) + { + allOneExceptMax = true ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].support == maxSupport ) + continue ; + if ( sites[k].support > 1 ) + { + allOneExceptMax = false ; + break ; + } + } + } + + for ( k = i ; k < j ; ++k ) + { + if ( ( sites[k].support < 0.01 * maxSupport && sites[k].support <= 3 ) + || sites[k].support < 0.001 * maxSupport || sites[k].strand != strand // The introns from the different strand are filtered. + || ( sites[k].support < 0.02 * maxSupport && sites[k].mismatchSum >= 2 * sites[k].support ) + || ( allOneExceptMax && sites[k].support == 1 ) + || ( sites[k].support <= 2 && sites[k].mismatchSum >= 2 * sites[k].support ) + || ( maxSupport >= 2 && sites[k].support == 1 && ( ABS( sites[k].oppositePos - sites[k].pos - 1 ) >= 10000 || sites[k].mismatchSum != 0 ) ) ) + { + for ( l = i - 1 ; l >= 0 && sites[l].chrId == sites[i].chrId && sites[l].pos >= sites[k].oppositePos ; --l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + + for ( l = j ; l < size && sites[l].chrId == sites[i].chrId && sites[l].pos <= sites[k].oppositePos ; ++l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + } + /*else if ( sites[k].support <= 1 && sites[k].oppositePos - sites[k].pos + 1 >= 30000 ) + { + for ( l = j ; l < size && sites[l].chrId == sites[i].chrId && sites[l].pos <= sites[k].oppositePos ; ++l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + if ( l - j >= 20 ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + } + + }*/ + } + i = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + if ( sites[k].strand == '?' ) + sites[k].strand = '.' ; + ++k ; + } + } + sites.resize( k ) ; +} + +// Remove the same sites. +void KeepUniqSplitSites( std::vector< struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + if ( sites[j].chrId != sites[i].chrId || sites[j].pos != sites[i].pos || sites[j].type != sites[i].type ) + break ; + sites[k] = sites[i] ; + ++k ; + /*else + { + if ( sites[i].type != sites[k-1].type ) + { + printf( "%d\n", sites[i].pos ) ; + } + }*/ + i = j ; + } + sites.resize( k ) ; + + // For the sites that corresponds to the start of an exon, we remove the adjust to it and + /*for ( i = 1 ; i < size ; ++i ) + { + if ( sites[i].pos == sites[i - 1].pos && sites[i - 1].type == 2 ) + { + if ( sites[i].type == 1 ) + ++sites[i].pos ; + } + }*/ +} + +// Filter split sites that are extremely close to each other but on different strand. +void FilterNearSplitSites( std::vector< struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size - 1 ; ++i ) + { + if ( sites[i].support < 0 || sites[i].type != sites[i + 1].type || sites[i].chrId != sites[i + 1].chrId ) + continue ; + if ( sites[i + 1].pos - sites[i].pos <= 7 && + ( sites[i + 1].strand != sites[i].strand || + sites[i].strand == '?' ) ) + { + int tag = i ; + if ( sites[i + 1].support < sites[i].support ) + tag = i + 1 ; + sites[tag].support = -1 ; + int direction ; + if ( sites[tag].oppositePos < sites[tag].pos ) + direction = -1 ; + else + direction = 1 ; + + for ( j = tag ; j >= 0 && j < size ; j += direction ) + if ( sites[j].pos == sites[tag].oppositePos && sites[j].oppositePos == sites[tag].pos ) + { + sites[j].support = -1 ; + break ; + } + } + } + + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + } + sites.resize( k ) ; +} + +void FilterRepeatSplitSites( std::vector<struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + { + if ( sites[j].pos != sites[i].pos || sites[j].type != sites[i].type || sites[i].chrId != sites[j].chrId ) + break ; + } + int max = -1 ; + int maxtag = 0 ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].uniqSupport > max ) + { + max = sites[k].uniqSupport ; + maxtag = k ; + } + else if ( sites[k].uniqSupport == max && sites[k].support > sites[maxtag].support ) + { + maxtag = k ; + } + } + + if ( max > -1 ) + { + if ( sites[maxtag].uniqSupport > sites[maxtag].support * 0.1 ) + { + for ( k = i ; k < j ; ++k ) + if ( sites[k].uniqSupport < 0.05 * sites[k].support ) + { + sites[k].support = -1 ; + + int direction ; + if ( sites[k].oppositePos < sites[k].pos ) + direction = -1 ; + else + direction = 1 ; + int l ; + for ( l = k ; l >= 0 && l < size ; l += direction ) + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + break ; + } + } + } + else + { + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].support <= 10 ) + { + sites[k].support = -1 ; + + int direction ; + if ( sites[k].oppositePos < sites[k].pos ) + direction = -1 ; + else + direction = 1 ; + int l ; + for ( l = k ; l >= 0 && l < size ; l += direction ) + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + break ; + } + } + } + } + } + + i = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + } + sites.resize( k ) ; + +} + + +// When k=1, the gamma distribution becomes exponential distribution, and can be optimized analytically.. +// Maximize: sum_i z_i( log( 1/theta e^{-x_i / theta} ) +/*double ThetaOfExponentialDistribution( double *x, double *z, int n ) +{ + double sumZ = 0; + double sumZX = 0 ; + for ( i = 0 ; i < n ; ++i ) + { + sumZ += z[i] ; + sumZX += z[i] * x[i] ; + } +}*/ + +// for boundK, if it is positive, it represent the upper bound. If it is negative, -boundK will be the lower bound for k. +// if boundK==0, there is no extra bound. +// The same logic for boundProduct, which bounds k*theta +void GradientDescentGammaDistribution( double &k, double &theta, double initK, double initTheta, double lowerBoundK, double upperBoundK, + double lowerBoundMean, double upperBoundMean, double *x, double *z, int n ) +{ + int i ; + k = initK ; + theta = initTheta ; + double c = 0.5 ; + int iterCnt = 1 ; + + double sumZ = 0 ; + double sumZX = 0 ; + double sumZLogX = 0 ; + double Hessian[2][2] ; // 0 for k, 1 for theta + double inverseHessian[2][2] ; + int tmp ; + + double positiveKRecord = -5, positiveThetaRecord = -5 ; // record the value of k, theta when theta, k becomes non-positive. + + for ( i = 0 ; i < n ; ++i ) + { + sumZ += z[i] ; + sumZX += z[i] * x[i] ; + sumZLogX += z[i] * log( x[i] ) ; + } + + while ( 1 ) + { + double gradK = 0 ; + double gradTheta = 0 ; + + double prevK = k ; + double prevTheta = theta ; + double digammaK = digammal( k ) ; + + gradK = sumZ * ( -log( theta ) - digammaK ) + sumZLogX ; + gradTheta = -sumZ * ( k / theta ) + sumZX / ( theta * theta ) ; + + Hessian[0][0] = -sumZ * trigamma( k, &tmp ) ; + Hessian[0][1] = -sumZ / theta ; // \partial l / ( \partial k \partial theta) + Hessian[1][0] = -sumZ / theta ; + Hessian[1][1] = sumZ * k / ( theta * theta ) - 2 * sumZX / ( theta * theta * theta ) ; + + double det = Hessian[0][0] * Hessian[1][1] - Hessian[0][1] * Hessian[1][0] ; + /*printf( "%s iter %d:\n", __func__, iterCnt ) ; + printf( "%lf %lf %lf %lf\n", sumZ, k, theta, sumZX ) ; + printf( "%lf %lf %lf\n", gradK, gradTheta, det ) ; + printf( "%lf %lf %lf %lf\n", Hessian[0][0], Hessian[0][1], Hessian[1][0], Hessian[1][1] ) ;*/ + if ( det <= 1e-4 && det >=-1e-4 ) + { + k = k + c / iterCnt * gradK ; + theta = theta + c / iterCnt * gradTheta ; + } + else + { + inverseHessian[0][0] = Hessian[1][1] / det ; + inverseHessian[0][1] = -Hessian[0][1] / det ; + inverseHessian[1][0] = -Hessian[1][0] / det ; + inverseHessian[1][1] = Hessian[0][0] / det ; + //printf( "%lf %lf %lf %lf: %lf\n=====\n", inverseHessian[0][0], inverseHessian[0][1], inverseHessian[1][0], inverseHessian[1][1], + // Hessian[1][0] * inverseHessian[0][1] + Hessian[1][1] * inverseHessian[1][1] ) ; + double step = 0.5 ; + k = k - step * ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + theta = theta - step * ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + + bool flag = false ; + if ( k <= 1e-6 ) + { + step = ( prevK - 1e-6 ) / ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + + if ( ABS( theta - positiveThetaRecord ) < 1e-5 ) + flag = true ; + positiveThetaRecord = theta ; + } + if ( theta <= 1e-6 ) + { + double tmp = ( prevTheta - 1e-6 ) / ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + if ( tmp < step ) + step = tmp ; + + if ( ABS( k - positiveKRecord ) < 1e-5 ) + flag = true ; + positiveKRecord = k ; + } + + if ( step != 0.5 ) + { + k = prevK - step * ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + theta = prevTheta - step * ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + } + + /*if ( flag ) + { + k = prevK ; + theta = prevTheta ; + break ; + }*/ + } + + if ( upperBoundK > 0 && k > upperBoundK ) + k = upperBoundK ; + else if ( lowerBoundK > 0 && k < lowerBoundK ) + k = lowerBoundK ; + + if ( upperBoundMean > 0 && k * theta > upperBoundMean ) + { + theta = upperBoundMean / k ; + } + else if ( lowerBoundMean > 0 && k * theta < lowerBoundMean ) + { + theta = lowerBoundMean / k ; + } + + if ( k <= 1e-6 ) + { + k = 1e-6 ; + } + + if ( theta <= 1e-6 ) + { + theta = 1e-6 ; + } + + double diff = ABS( prevK - k ) + ABS( prevTheta - theta ) ; + if ( diff < 1e-5 ) + break ; + + ++iterCnt ; + //if ( det <= 1e-4 && det >=-1e-4 && k >= 5000 ) //&& diff < 1 ) + if ( k >= 10000 ) //&& diff < 1 ) + { + k = prevK ; + theta = prevTheta ; + break ; + } + if ( iterCnt == 1000 ) + break ; + } +} + +double MixtureGammaEM( double *x, int n, double &pi, double *k, double *theta, int tries, double meanBound[2], int iter = 1000 ) +{ + int i ; + double *z = new double[n] ; // the expectation that it assigned to model 0. + double *oneMinusZ = new double[n] ; + int t = 0 ; + double history[5] = {-1, -1, -1, -1, -1} ; + double maxX = -1 ; + double sumX = 0 ; + if ( n <= 0 ) + return 0 ; + + for ( i = 0 ; i < n ; ++i ) + { + sumX += x[i] ; + if ( x[i] > maxX ) + maxX = x[i] ; + } + if ( maxX > meanBound[1] && meanBound[1] >= 0 ) + maxX = meanBound[1] ; + + /*if ( meanBound[1] == -1 ) + { + // The EM for coverage + maxX = 10.0 ; + }*/ + + while ( 1 ) + { + double npi, nk[2], ntheta[2] ; + double sum = 0 ; + for ( i = 0 ; i < n ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ); + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ); + //z[i] = exp( lf0 + log( pi ) ) / ( exp( lf0 + log( pi ) ) + exp( lf1 + log( 1 - pi ) ) ) ; + if ( pi != 0 ) + z[i] = MixtureGammaAssignment( x[i], pi, k, theta ) ; + else + z[i] = 0 ; + /*if ( isnan( z[i] ) ) + { + printf( "nan: %lf %lf %lf %lf\n", x[i], pi, k, theta ) ; + }*/ + oneMinusZ[i] = 1 - z[i] ; + sum += z[i] ; + } + + // compute new pi. + npi = sum / n ; + + // Use gradient descent to compute new k and theta. + if ( 1 ) //pi > 0 ) + { + double bound ; + if ( meanBound[1] != -1 ) // the EM for ratio + { + bound = ( theta[1] * k[1] > 1 ) ? 1 : ( theta[1] * k[1] ) / ( 1 + tries ); + GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], k[1], -1, -1, bound, x, z, n ) ; // It seems setting an upper bound 1 for k[0] is not a good idea. + } + else + { + bound = ( theta[1] * k[1] > 1 ) ? 1 : ( theta[1] * k[1] ) / ( 1 + tries ) ; + GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], k[1], -1, meanBound[0], bound, x, z, n ) ; // It seems setting an upper bound 1 for k[0] is not a good idea. + } + GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], -1, k[0], theta[0] * k[0], maxX, x, oneMinusZ, n ) ; + } + else + { + GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], 0, 0, 0, 0, x, oneMinusZ, n ) ; + } + + double diff ; + if ( isnan( npi ) || isnan( nk[0] ) || isnan( nk[1] ) || isnan( ntheta[0] ) || isnan( ntheta[1] ) ) + { + delete[] z ; + delete[] oneMinusZ ; + return -1 ; + } + diff = ABS( nk[0] - k[0] ) + ABS( nk[1] - k[1] ) + + ABS( ntheta[0] - theta[0] ) + ABS( ntheta[1] - theta[1] ) ; // pi is fully determined by these 4 parameters. + if ( diff < 1e-4 ) + break ; + diff = ABS( nk[0] - history[1] ) + ABS( nk[1] - history[2] ) + + ABS( ntheta[0] - history[3] ) + ABS( ntheta[1] - history[4] ) ; // pi is fully determined by these 4 parameters. + if ( diff < 1e-4 ) + break ; + + history[0] = pi ; + history[1] = k[0] ; + history[2] = k[1] ; + history[3] = theta[0] ; + history[4] = theta[1] ; + + pi = npi ; + k[0] = nk[0] ; + k[1] = nk[1] ; + theta[0] = ntheta[0] ; + theta[1] = ntheta[1] ; + + /*double logLikelihood = 0 ; + for ( i = 0 ; i < n ; ++i ) + logLikelihood += log( pi * exp( LogGammaDensity( x[i], k[0], theta[0]) ) + + (1 - pi ) * exp( LogGammaDensity( x[i], k[1], theta[1] ) ) ) ;*/ + + //printf( "%d: %lf %lf %lf %lf %lf\n", t, pi, k[0], theta[0], k[1], theta[1] ) ; + + ++t ; + if ( iter != -1 && t >= iter ) + break ; + } + delete[] z ; + delete[] oneMinusZ ; + return 0 ; +} + +bool IsParametersTheSame( double *k, double *theta ) +{ + if ( ABS( k[0] - k[1] ) < 1e-2 && ABS( theta[0] - theta[1] ) < 1e-2 ) + return true ; + return false ; +} + +int RatioAndCovEM( double *covRatio, double *cov, int n, double &piRatio, double kRatio[2], + double thetaRatio[2], double &piCov, double kCov[2], double thetaCov[2] ) +{ + int i ; + piRatio = 0.6 ; // mixture coefficient for model 0 and 1 + kRatio[0] = 0.9 ; + kRatio[1] = 0.45 ; + thetaRatio[0] = 0.05 ; + thetaRatio[1] = 1 ; + double meanBound[2] = {-1, 1} ; // [0] is for the lower bound of the noise model, [1] is for the upper bound of the true model + + /*double *filteredCovRatio = new double[n] ;// ignore the ratio that is greater than 5. + int m = 0 ; + for ( i = 0 ; i < n ; ++i ) + if ( covRatio[i] < 1.0 ) + { + filteredCovRatio[m] = covRatio[i] ; + ++m ; + }*/ + srand( 17 ) ; + int maxTries = 10 ; + int t = 0 ; + double *buffer = new double[n] ; + for ( i = 0 ; i < n ; ++i ) + buffer[i] = covRatio[i] ; + qsort( buffer, n, sizeof( double ), CompDouble ) ; + //covRatio = buffer ; + while ( 1 ) + { + //printf( "EM\n" ) ; + MixtureGammaEM( covRatio, n, piRatio, kRatio, thetaRatio, t, meanBound ) ; + //printf( "%lf %lf %lf %lf %lf\n", piRatio, kRatio[0], kRatio[1], thetaRatio[0], thetaRatio[1] ) ; + if ( piRatio > 0.999 || piRatio < 0.001 || IsParametersTheSame( kRatio, thetaRatio ) ) + { + ++t ; + if ( t > maxTries ) + break ; + piRatio = 0.6 ; + kRatio[0] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( kRatio[0] <= 0 ) + kRatio[0] = 0.9 ; + kRatio[1] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( kRatio[1] <= 0 ) + kRatio[1] = 0.45 ; + thetaRatio[0] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( thetaRatio[0] <= 0 ) + thetaRatio[0] = 0.05 ; + thetaRatio[1] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( thetaRatio[1] <= 0 ) + thetaRatio[1] = 1 ; + if ( kRatio[0] < kRatio[1] ) + { + if ( rand() & 1 ) + kRatio[0] = kRatio[1] ; + else + kRatio[1] = kRatio[0] ; + } + if ( kRatio[0] * thetaRatio[0] > kRatio[1] * thetaRatio[1] ) + { + thetaRatio[0] = kRatio[1] * thetaRatio[1] / kRatio[0] ; + } + //printf( "%lf %lf %lf %lf %lf\n", piRatio, kRatio[0], kRatio[1], thetaRatio[0], thetaRatio[1] ) ; + + continue ; + } + + break ; + } + //delete[] filteredCovRatio ; + if ( t > maxTries && piRatio > 0.999 ) + { + /*piRatio = 0.6 ; // mixture coefficient for model 0 and 1 + kRatio[0] = 0.9 ; + kRatio[1] = 0.45 ; + thetaRatio[0] = 0.05 ; + thetaRatio[1] = 1 ;*/ + piRatio = 0.999 ; + } + if ( IsParametersTheSame( kRatio, thetaRatio ) || piRatio <= 1e-3 ) + piRatio = 1e-3 ; + + piCov = piRatio ; // mixture coefficient for model 0 and 1 + kCov[0] = 0.9 ; + kCov[1] = 0.45 ; + thetaCov[0] = 3 ; + thetaCov[1] = 12 ; + + // only do one iteration of EM, so that pi does not change? + // But it seems it still better to run full EM. + meanBound[0] = 1.01 ; + meanBound[1] = -1 ; + + //printf( "for coverage:\n" ) ; + //piCov = 0.001000 ; + MixtureGammaEM( cov, n, piCov, kCov, thetaCov, 0, meanBound ) ; + //printf( "for coverage done\n" ) ; + piCov = piRatio ; + + delete []buffer ; + + return 0 ; +} + +double GetPValue( double x, double *k, double *theta ) +{ + int fault ; + double p ; + p = 1 - gammad( x / theta[0], k[0], &fault ) ; + return p ; +} + +// if x's value is less than the average of (k0-1)*theta0, then we force x=(k0-1)*theta0, +// the mode of the model 0. Of course, it does not affect when k0<=1 already. +double MixtureGammaAssignmentAdjust( double x, double pi, double* k, double *theta ) +{ + if ( x < ( k[0] - 1 ) * theta[0] ) + { + x = ( k[0] - 1 ) * theta[0] ; + } + return MixtureGammaAssignment( x, pi, k, theta ) ; +} + + +// Transform the cov number for better fitting +double TransformCov( double c ) +{ + double ret ; + // original it is c-1. + // Use -2 instead of -1 is that many region covered to 1 reads will be filtered when + // build the subexons. + // + //ret = sqrt( c ) - 1 ; + if ( c <= 2 + 1e-6 ) + ret = 1e-6 ; + else + ret = c - 2 ; + return ret ; + //return log( c ) / log( 2.0 ) ; +} + +int main( int argc, char *argv[] ) +{ + int i, j ; + bool noStats = false ; + if ( argc < 3 ) + { + fprintf( stderr, usage ) ; + exit( 1 ) ; + } + + gMinDepth = 2 ; + + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "--noStats" ) ) + { + noStats = true ; + continue ; + } + else if ( !strcmp( argv[i], "--minDepth" ) ) + { + gMinDepth = atoi( argv[i + 1] ) ; + ++i ; + continue ; + } + else + { + fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ; + return 0 ; + } + } + + Alignments alignments ; + alignments.Open( argv[1] ) ; + std::vector<struct _splitSite> splitSites ; // only compromised the + std::vector<struct _splitSite> allSplitSites ; + + // read in the splice site + FILE *fp ; + fp = fopen( argv[2], "r" ) ; + char chrom[50] ; + int64_t start, end ; + int support ; + char strand[3] ; + int uniqSupport, secondarySupport, uniqEditDistance, secondaryEditDistance ; + while ( fscanf( fp, "%s %" PRId64 " %" PRId64 " %d %s %d %d %d %d", chrom, &start, &end, &support, strand, + &uniqSupport, &secondarySupport, &uniqEditDistance, &secondaryEditDistance ) != EOF ) + { + if ( support <= 0 ) + continue ; + //if ( !( uniqSupport >= 1 + // || secondarySupport > 10 ) ) + //if ( uniqSupport <= 0.01 * ( uniqSupport + secondarySupport ) || ( uniqSupport == 0 && secondarySupport < 20 ) ) + //if ( uniqSupport == 0 && secondarySupport <= 10 ) + // continue ; + int chrId = alignments.GetChromIdFromName( chrom ) ; + struct _splitSite ss ; + --start ; + --end ; + ss.pos = start ; + ss.chrId = chrId ; + ss.type = 2 ; + ss.oppositePos = end ; + ss.strand = strand[0] ; + ss.support = support ; + ss.uniqSupport = uniqSupport ; + ss.mismatchSum = uniqEditDistance + secondaryEditDistance ; + splitSites.push_back( ss ) ; + + ss.pos = end ; + ss.type = 1 ; + ss.oppositePos = start ; + ss.strand = strand[0] ; + ss.support = support ; + ss.uniqSupport = uniqSupport ; + ss.mismatchSum = uniqEditDistance + secondaryEditDistance ; + splitSites.push_back( ss ) ; + } + fclose( fp ) ; + //printf( "ss:%d\n", splitSites.size() ) ; + + //printf( "ss:%d\n", splitSites.size() ) ; + + alignments.GetGeneralInfo( true ) ; + // Build the blocks + Blocks regions ; + alignments.Rewind() ; + regions.BuildExonBlocks( alignments ) ; + //printf( "%d\n", regions.exonBlocks.size() ) ; + + FilterAndSortSplitSites( splitSites ) ; + FilterNearSplitSites( splitSites ) ; + FilterRepeatSplitSites( splitSites ) ; + regions.FilterSplitSitesInRegions( splitSites ) ; + regions.FilterGeneMergeSplitSites( splitSites ) ; + + + allSplitSites = splitSites ; + KeepUniqSplitSites( splitSites ) ; + + //for ( i = 0 ; i < splitSites.size() ; ++i ) + // printf( "%d %d\n", splitSites[i].pos + 1, splitSites[i].oppositePos + 1 ) ; + // Split the blocks using split site + regions.SplitBlocks( alignments, splitSites ) ; + //printf( "%d\n", regions.exonBlocks.size() ) ; + /*for ( i = 0 ; i < regions.exonBlocks.size() ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + printf( "%s %" PRId64 " %" PRId64 " %d %d\n", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType ) ; + } + return 0 ;*/ + // Recompute the coverage for each block. + alignments.Rewind() ; + //printf( "Before computeDepth: %d\n", regions.exonBlocks.size() ) ; + + regions.ComputeDepth( alignments ) ; + //printf( "After computeDepth: %d\n", regions.exonBlocks.size() ) ; + + // Merge blocks that may have a hollow coverage by accident. + regions.MergeNearBlocks() ; + //printf( "After merge: %d\n", regions.exonBlocks.size() ) ; + + // Put the intron informations + regions.AddIntronInformation( allSplitSites, alignments ) ; + //printf( "After add information.\n" ) ; + + // Compute the average ratio against the left and right connected subexons. + regions.ComputeRatios() ; + //printf( "After compute ratios.\n" ) ; + + //printf( "Finish building regions.\n" ) ; + if ( noStats ) + { + // just output the subexons. + if ( realpath( argv[1], buffer ) == NULL ) + { + strcpy( buffer, argv[1] ) ; + } + printf( "#%s\n", buffer ) ; + printf( "#fitted_ir_parameter_ratio: pi: -1 k0: -1 theta0: -1 k1: -1 theta1: -1\n" ) ; + printf( "#fitted_ir_parameter_cov: pi: -1 k0: -1 theta0: -1 k1: -1 theta1: -1\n" ) ; + + int blockCnt = regions.exonBlocks.size() ; + for ( int i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + double avgDepth = (double)e.depthSum / ( e.end - e.start + 1 ) ; + printf( "%s %" PRId64 " %" PRId64 " %d %d %lf -1 -1 -1 -1 ", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType, avgDepth ) ; + int prevCnt = e.prevCnt ; + if ( i > 0 && e.start == regions.exonBlocks[i - 1].end + 1 && + e.leftType == regions.exonBlocks[i - 1].rightType ) + { + printf( "%d ", prevCnt + 1 ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + printf( "%" PRId64 " ", regions.exonBlocks[i - 1].end + 1 ) ; + } + else + { + printf( "%d ", prevCnt ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + } + + int nextCnt = e.nextCnt ; + if ( i < blockCnt - 1 && e.end == regions.exonBlocks[i + 1].start - 1 && + e.rightType == regions.exonBlocks[i + 1].leftType ) + { + printf( "%d %" PRId64 " ", nextCnt + 1, regions.exonBlocks[i + 1].start + 1 ) ; + } + else + printf( "%d ", nextCnt ) ; + for ( j = 0 ; j < nextCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.next[j] ].start + 1 ) ; + printf( "\n" ) ; + + } + return 0 ; + } + + // Extract the blocks for different events. + int blockCnt = regions.exonBlocks.size() ; + std::vector<struct _block> irBlocks ; // The regions corresponds to intron retention events. + double *leftClassifier = new double[ blockCnt ] ; + double *rightClassifier = new double[ blockCnt ] ; + std::vector<struct _block> overhangBlocks ; //blocks like (...[ or ]...) + std::vector<struct _block> islandBlocks ; // blocks like (....) + + for ( i = 0 ; i < blockCnt ; ++i ) + { + int ltype = regions.exonBlocks[i].leftType ; + int rtype = regions.exonBlocks[i].rightType ; + leftClassifier[i] = -1 ; + rightClassifier[i] = -1 ; + + //double avgDepth = (double)regions.exonBlocks[i].depthSum / ( regions.exonBlocks[i].end - regions.exonBlocks[i].start + 1 ) ; + + if ( ltype == 2 && rtype == 1 ) + { + // candidate intron retention. + // Note that when I compute the ratio, it is already made sure that the avgDepth>1. + double ratio = regions.PickLeftAndRightRatio( regions.exonBlocks[i] ) ; + + //printf( "%lf %lf\n", regions.exonBlocks[i].leftRatio, regions.exonBlocks[i].rightRatio ) ; + if ( ratio > 0 ) + { + regions.exonBlocks[i].ratio = ratio ; + irBlocks.push_back( regions.exonBlocks[i] ) ; + irBlocks[ irBlocks.size() - 1 ].contigId = i ; + } + } + else if ( ( ltype == 0 && rtype == 1 ) || ( ltype == 2 && rtype == 0 ) ) + { + // subexons like (...[ or ]...) + double ratio ; + if ( ltype == 0 ) + { + ratio = regions.exonBlocks[i].rightRatio ; + } + else if ( ltype == 2 ) + { + ratio = regions.exonBlocks[i].leftRatio ; + } + if ( ratio > 0 ) + { + regions.exonBlocks[i].ratio = ratio ; + overhangBlocks.push_back( regions.exonBlocks[i] ) ; + overhangBlocks[ overhangBlocks.size() - 1].contigId = i ; + } + } + else if ( ltype == 0 && rtype == 0 ) + { + islandBlocks.push_back( regions.exonBlocks[i] ) ; + islandBlocks[ islandBlocks.size() - 1].contigId = i ; + } + } + + // Compute the histogram for each intron. + int irBlockCnt = irBlocks.size() ; + double *cov = new double[irBlockCnt] ; + double *covRatio = new double[ irBlockCnt ] ; + double piRatio, kRatio[2], thetaRatio[2] ; + double piCov, kCov[2], thetaCov[2] ; + for ( i = 0 ; i < irBlockCnt ; ++i ) + { + double avgDepth = regions.GetAvgDepth( irBlocks[i] ) ; + //cov[i] = ( avgDepth - 1 ) / ( flankingAvg - 1 ) ; + cov[i] = TransformCov( avgDepth ) ; + + covRatio[i] = regions.PickLeftAndRightRatio( irBlocks[i] ) ; + //cov[i] = avgDepth / anchorAvg ; + //printf( "%"PRId64" %d %d: %lf %lf\n", irBlocks[i].depthSum, irBlocks[i].start, irBlocks[i].end, avgDepth, cov[i] ) ; + } + + /*fp = fopen( "ratio.out", "r" ) ; + int irBlockCnt = 0 ; + double *cov = new double[10000] ; + while ( 1 ) + { + double r ; + if ( fscanf( fp, "%lf", &r ) == EOF ) + break ; + cov[ irBlockCnt ] = r ; + ++irBlockCnt ; + } + fclose( fp ) ;*/ + RatioAndCovEM( covRatio, cov, irBlockCnt, piRatio, kRatio, thetaRatio, piCov, kCov, thetaCov ) ; + + for ( i = 0 ; i < irBlockCnt ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ) ; + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ; + + double p1, p2, p ; + + p1 = MixtureGammaAssignmentAdjust( covRatio[i], piRatio, kRatio, thetaRatio ) ; + p2 = MixtureGammaAssignmentAdjust( cov[i], piCov, kCov, thetaCov ) ; + + /*p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ; + if ( piRatio != 0 ) + p2 = GetPValue( cov[i], kCov, thetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ; + else + p2 = p1 ;*/ + //printf( "%lf %lf: %lf %lf\n", covRatio[i], cov[i], p1, p2 ) ; + p = p1 > p2 ? p1 : p2 ; + + + //printf( "%d %d: avg: %lf ratio: %lf p: %lf\n", irBlocks[i].start, irBlocks[i].end, irBlocks[i].depthSum / (double)( irBlocks[i].end - irBlocks[i].start + 1 ), covRatio[i], + // p ) ; + leftClassifier[ irBlocks[i].contigId ] = p ; + rightClassifier[ irBlocks[i].contigId ] = p ; + } + + // Process the classifier for overhang subexons and the subexons to see whether we need soft boundary + int overhangBlockCnt = overhangBlocks.size() ; + delete []cov ; + delete []covRatio ; + + cov = new double[ overhangBlockCnt ] ; + covRatio = new double[overhangBlockCnt] ; + double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; + double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; + + for ( i = 0 ; i < overhangBlockCnt ; ++i ) + { + covRatio[i] = overhangBlocks[i].ratio ; + cov[i] = TransformCov( regions.GetAvgDepth( overhangBlocks[i] ) ) ; + } + RatioAndCovEM( covRatio, cov, overhangBlockCnt, overhangPiRatio, overhangKRatio, overhangThetaRatio, overhangPiCov, overhangKCov, overhangThetaCov ) ; + + for ( i = 0 ; i < overhangBlockCnt ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ) ; + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ; + + double p1, p2, p ; + p1 = MixtureGammaAssignmentAdjust( covRatio[i], overhangPiRatio, overhangKRatio, overhangThetaRatio ) ; + p2 = MixtureGammaAssignmentAdjust( cov[i], overhangPiCov, overhangKCov, overhangThetaCov ) ; + + /*p1 = GetPValue( covRatio[i], overhangKRatio, overhangThetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ; + if ( overhangPiRatio != 0) + p2 = GetPValue( cov[i], overhangKCov, overhangThetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ; + else + p2 = p1 ;*/ + + //p = p1 < p2 ? p1 : p2 ; + p = sqrt( p1 * p2 ) ; + + int idx = overhangBlocks[i].contigId ; + if ( regions.exonBlocks[idx].rightType == 0 ) + leftClassifier[ idx ] = rightClassifier[ idx ] = p ; + else + leftClassifier[ idx ] = rightClassifier[ idx ] = p ; + } + + for ( i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + //if ( ( e.leftType == 0 && e.rightType == 1 ) || + // variance-stabailizing transformation of poisson distribution. But we are more conservative here. + // The multiply 2 before that is because we ignore the region below 0, so we need to somehow renormalize the distribution. + if ( e.leftType == 1 ) + { + if ( e.leftRatio >= 0 ) + leftClassifier[i] = 2 * alnorm( e.leftRatio * 2.0 , true ) ; + else + leftClassifier[i] = 1 ; + } + /*else if ( e.leftType == 0 ) + { + // If this region is a start of a gene, the other sample might introduce + // a new intron before it. So we want to test whether this region can still + // be a start of a gene even there is an intron before it. + for ( j = i + 1 ; j < blockCnt ; ++j ) + { + if ( regions.exonBlocks[j].contigId != regions.exonBlocks[i].contigId ) + break ; + } + + for ( k = i ; k < j ; ++k ) + if ( regions.exonBlocks[j].leftType == 1 ) + break ; + if ( k >= j ) + { + leftClassifier[i] = alnorm( ) + } + }*/ + + //if ( ( e.rightType == 0 && e.leftType == 2 ) || + if ( e.rightType == 2 ) + { + if ( e.rightRatio >= 0 ) + rightClassifier[i] = 2 * alnorm( e.rightRatio * 2.0, true ) ; + else + rightClassifier[i] = 1 ; + } + } + + // Process the result for subexons seems like single-exon transcript (...) + int islandBlockCnt = islandBlocks.size() ; + //std::sort( islandBlocks.begin(), islandBlocks.end(), CompBlocksByAvgDepth ) ; + for ( i = 0, j = 0 ; i < islandBlockCnt ; ++i ) + { + /*if ( regions.GetAvgDepth( islandBlocks[i] ) != regions.GetAvgDepth( islandBlocks[j] ) ) + j = i ; + leftClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ; + rightClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ;*/ + double p = GetPValue( TransformCov( regions.GetAvgDepth( islandBlocks[i] ) ), kCov, thetaCov ) ; + leftClassifier[ islandBlocks[i].contigId ] = p ; + rightClassifier[ islandBlocks[i].contigId ] = p ; + } + + + // Output the result. + if ( realpath( argv[1], buffer ) == NULL ) + { + strcpy( buffer, argv[1] ) ; + } + printf( "#%s\n", buffer ) ; + // TODO: higher precision. + printf( "#fitted_ir_parameter_ratio: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", piRatio, kRatio[0], thetaRatio[0], kRatio[1], thetaRatio[1] ) ; + printf( "#fitted_ir_parameter_cov: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", piCov, kCov[0], thetaCov[0], kCov[1], thetaCov[1] ) ; + + printf( "#fitted_overhang_parameter_ratio: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", overhangPiRatio, overhangKRatio[0], overhangThetaRatio[0], overhangKRatio[1], overhangThetaRatio[1] ) ; + printf( "#fitted_overhang_parameter_cov: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", overhangPiCov, overhangKCov[0], overhangThetaCov[0], overhangKCov[1], overhangThetaCov[1] ) ; + + + for ( int i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + double avgDepth = regions.GetAvgDepth( e ) ; + printf( "%s %" PRId64 " %" PRId64 " %d %d %c %c %lf %lf %lf %lf %lf ", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType, + e.leftStrand, e.rightStrand, avgDepth, + e.leftRatio, e.rightRatio, leftClassifier[i], rightClassifier[i] ) ; + int prevCnt = e.prevCnt ; + if ( i > 0 && e.start == regions.exonBlocks[i - 1].end + 1 ) + //&& e.leftType == regions.exonBlocks[i - 1].rightType ) + { + printf( "%d ", prevCnt + 1 ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + printf( "%" PRId64 " ", regions.exonBlocks[i - 1].end + 1 ) ; + } + else + { + printf( "%d ", prevCnt ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + } + + int nextCnt = e.nextCnt ; + if ( i < blockCnt - 1 && e.end == regions.exonBlocks[i + 1].start - 1 ) + //&& e.rightType == regions.exonBlocks[i + 1].leftType ) + { + printf( "%d %" PRId64 " ", nextCnt + 1, regions.exonBlocks[i + 1].start + 1 ) ; + } + else + printf( "%d ", nextCnt ) ; + for ( j = 0 ; j < nextCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.next[j] ].start + 1 ) ; + printf( "\n" ) ; + } + + delete[] cov ; + delete[] covRatio ; + delete[] leftClassifier ; + delete[] rightClassifier ; +}