Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/blocks.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/blocks.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1580 @@ +// The class manage the blocks +// Li Song + +#ifndef _LSONG_CLASSES_BLOCKS_HEADER +#define _LSONG_CLASSES_BLOCKS_HEADER + +#include <stdlib.h> +#include <vector> +#include <map> +#include <assert.h> +#include <math.h> +#include <set> +#include <inttypes.h> + +#include "defs.h" + +extern bool VERBOSE ; +extern FILE *fpOut ; + +extern int gMinDepth ; + + +struct _splitSite // means the next position belongs to another block +{ + int64_t pos ; + char strand ; + int chrId ; + int type ; // 1-start of an exon. 2-end of an exon. + + int support ; + int uniqSupport ; + + int mismatchSum ; + + int64_t oppositePos ; // the position of the other sites to form the intron. +} ; + +struct _block +{ + int chrId ; + int contigId ; + int64_t start, end ; + int64_t leftSplice, rightSplice ; // Some information about the left splice site and right splice site. + // record the leftmost and rightmost coordinates of a splice site within this block + //or the length of read alignment support the splice sites. + //Or the number of support. + int64_t depthSum ; + + int leftType, rightType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. + + double leftRatio, rightRatio ; // the adjusted ratio-like value to the left and right anchor subexons. + double ratio ; + + char leftStrand, rightStrand ; + + int *depth ; + int prevCnt, nextCnt ; // Some connection information for the subexons. + int *prev ; + int *next ; +} ; + +// adjacent graph +struct _adj +{ + int ind ; + int info ; + int support ; + int next ; +} ; + +class Blocks +{ + private: + std::map<int, int> exonBlocksChrIdOffset ; + + int64_t Overlap( int64_t s0, int64_t e0, int64_t s1, int64_t e1, int64_t &s, int64_t &e ) + { + s = e = -1 ; + if ( e0 < s1 || s0 > e1 ) + return 0 ; + s = s0 > s1 ? s0 : s1 ; + e = e0 < e1 ? e0 : e1 ; + return e - s + 1 ; + } + + void Split( const char *s, char delimit, std::vector<std::string> &fields ) + { + int i ; + fields.clear() ; + if ( s == NULL ) + return ; + + std::string f ; + for ( i = 0 ; s[i] ; ++i ) + { + if ( s[i] == delimit || s[i] == '\n' ) + { + fields.push_back( f ) ; + f.clear() ; + } + else + f.append( 1, s[i] ) ; + } + fields.push_back( f ) ; + f.clear() ; + } + + void BuildBlockChrIdOffset() + { + // Build the map for the offsets of chr id in the exonBlock list. + exonBlocksChrIdOffset[ exonBlocks[0].chrId] = 0 ; + int cnt = exonBlocks.size() ; + for ( int i = 1 ; i < cnt ; ++i ) + { + if ( exonBlocks[i].chrId != exonBlocks[i - 1].chrId ) + exonBlocksChrIdOffset[ exonBlocks[i].chrId ] = i ; + } + } + + bool CanReach( int from, int to, struct _adj *adj, bool *visited ) + { + if ( visited[from] ) + return false ; + visited[from] = true ; + int p ; + p = adj[from].next ; + while ( p != -1 ) + { + if ( adj[p].ind == to ) + return true ; + if ( adj[p].ind < to + && CanReach( adj[p].ind, to, adj, visited ) ) + return true ; + p = adj[p].next ; + } + return false ; + } + + void AdjustAndCreateExonBlocks( int tag, std::vector<struct _block> &newExonBlocks ) + { + int i, j ; + if ( exonBlocks[tag].depth != NULL ) + { + // Convert the increment and decrement into actual depth. + int len = exonBlocks[tag].end - exonBlocks[tag].start + 1 ; + int *depth = exonBlocks[tag].depth ; + for ( i = 1 ; i < len ; ++i ) + depth[i] = depth[i - 1] + depth[i] ; + + struct _block island ; // the portion created by the hollow. + island.start = island.end = -1 ; + island.depthSum = 0 ; + + /*if ( exonBlocks[tag].start == 1562344 ) + { + for ( i = 0 ; i < len ; ++i ) + printf( "%d\n", depth[i] ) ; + }*/ + // Adjust boundary accordingly. + int64_t adjustStart = exonBlocks[tag].start ; + int64_t adjustEnd = exonBlocks[tag].end ; + + if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType != 0 ) + { + for ( i = len - 1 ; i >= 0 ; --i ) + if ( depth[i] < gMinDepth ) + break ; + ++i ; + if ( exonBlocks[tag].rightType == 2 && i + exonBlocks[tag].start > exonBlocks[tag].rightSplice ) + i = exonBlocks[tag].rightSplice - exonBlocks[tag].start ; + adjustStart = i + exonBlocks[tag].start ; + + if ( adjustStart > exonBlocks[tag].start ) + { + int s, e ; + // firstly [s,e] is the range of depth array. + for ( s = 0 ; s < i ; ++s ) + if ( depth[s] >= gMinDepth ) + break ; + for ( e = i - 1 ; e >= s ; -- e ) + if ( depth[e] >= gMinDepth ) + break ; + if ( e >= s ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = island.rightType = 0 ; + for ( j = s ; j <= e ; ++j ) + island.depthSum += depth[j] ; + island.start = s + exonBlocks[tag].start ; // offset the coordinate. + island.end = e + exonBlocks[tag].start ; + } + } + } + else if ( exonBlocks[tag].leftType != 0 && exonBlocks[tag].rightType == 0 ) + { + for ( i = 0 ; i < len ; ++i ) + if ( depth[i] < gMinDepth ) + break ; + --i ; + if ( exonBlocks[tag].leftType == 1 && i + exonBlocks[tag].start < exonBlocks[tag].leftSplice ) + i = exonBlocks[tag].leftSplice - exonBlocks[tag].start ; + adjustEnd = i + exonBlocks[tag].start ; + + if ( adjustEnd < exonBlocks[tag].end ) + { + int s, e ; + // firstly [s,e] is the range of depth array. + for ( s = i + 1 ; s < len ; ++s ) + if ( depth[s] >= gMinDepth ) + break ; + for ( e = len - 1 ; e >= s ; --e ) + if ( depth[e] >= gMinDepth ) + break ; + if ( e >= s ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = island.rightType = 0 ; + for ( j = s ; j <= e ; ++j ) + island.depthSum += depth[j] ; + island.start = s + exonBlocks[tag].start ; // offset the coordinate. + island.end = e + exonBlocks[tag].start ; + } + } + } + else if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType == 0 ) + { + for ( i = 0 ; i < len ; ++i ) + if ( depth[i] >= gMinDepth ) + break ; + adjustStart = i + exonBlocks[tag].start ; + + for ( i = len - 1 ; i >= 0 ; --i ) + if ( depth[i] >= gMinDepth ) + break ; + adjustEnd = i + exonBlocks[tag].start ; + } + else if ( exonBlocks[tag].leftType == 1 && exonBlocks[tag].rightType == 2 + && exonBlocks[tag].end - exonBlocks[tag].start + 1 >= 1000 ) + { + // The possible merge of two genes or merge of UTRs, if we don't break the low coverage part. + // If we decide to cut, I'll reuse the variable "island" to represent the subexon on right hand side. + int l ; + int gapSize = 30 ; + for ( i = 0 ; i <= len - ( gapSize + 1 ) ; ++i ) + { + if ( depth[i] < gMinDepth ) + { + for ( l = i ; l < i + ( gapSize + 1 ) ; ++l ) + if ( depth[l] >= gMinDepth ) + break ; + if ( l < i + ( gapSize + 1 ) ) + { + i = l - 1 ; + continue ; + } + else + break ; + } + } + + for ( j = len - 1 ; j >= ( gapSize + 1 ) - 1 ; --j ) + { + if ( depth[j] < gMinDepth ) + { + for ( l = j ; l >= j - ( gapSize + 1 ) + 1 ; --l ) + if ( depth[l] >= gMinDepth ) + break ; + + if ( l >= j - ( gapSize + 1 ) + 1 ) + { + j = l + 1 ; + continue ; + } + else + break ; + } + } + + if ( j - i + 1 > gapSize ) + { + // we break. + --i ; ++j ; + + adjustEnd = i + exonBlocks[tag].start ; + + if ( j < len - 1 ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = 0 ; + for ( l = j ; l <= len - 1 ; ++l ) + island.depthSum += depth[j] ; + island.start = j + exonBlocks[tag].start ; // offset the coordinate. + island.end = exonBlocks[tag].end ; + } + + exonBlocks[tag].rightType = 0 ; // put it here, so the island can get the right info + //fprintf( stderr, "break %d %d %d %d.\n", (int)exonBlocks[tag].start, (int)adjustEnd, i + (int)exonBlocks[tag].start, j + (int)exonBlocks[tag].start ) ; + } + } + + int lostDepthSum = 0 ; + for ( i = exonBlocks[tag].start ; i < adjustStart ; ++i ) + lostDepthSum += depth[i - exonBlocks[tag].start ] ; + for ( i = adjustEnd + 1 ; i < exonBlocks[tag].end ; ++i ) + lostDepthSum += depth[i - exonBlocks[tag].start ] ; + exonBlocks[tag].depthSum -= lostDepthSum ; + + delete[] exonBlocks[tag].depth ; + exonBlocks[tag].depth = NULL ; + + //if ( exonBlocks[tag].start == 1562344 ) + // printf( "%d %d\n", adjustStart, adjustEnd ) ; + if ( ( len > 1 && adjustEnd - adjustStart + 1 <= 1 ) || ( adjustEnd - adjustStart + 1 <= 0 ) ) + return ; + exonBlocks[tag].start = adjustStart ; + exonBlocks[tag].end = adjustEnd ; + + if ( island.start != -1 && island.end < exonBlocks[tag].start ) + newExonBlocks.push_back( island ) ; + + newExonBlocks.push_back( exonBlocks[tag] ) ; + if ( island.start != -1 && island.start > exonBlocks[tag].end ) + newExonBlocks.push_back( island ) ; + } + } + public: + std::vector<struct _block> exonBlocks ; + + Blocks() { } + ~Blocks() + { + int blockCnt = exonBlocks.size() ; + for ( int i = 0 ; i < blockCnt ; ++i ) + { + if ( exonBlocks[i].next != NULL ) + { + delete[] exonBlocks[i].next ; + } + if ( exonBlocks[i].prev != NULL ) + { + delete[] exonBlocks[i].prev ; + } + } + } + + double GetAvgDepth( const struct _block &block ) + { + return block.depthSum / (double)( block.end - block.start + 1 ) ; + } + + int BuildExonBlocks( Alignments &alignments ) + { + int tag = 0 ; + while ( alignments.Next() ) + { + int i, j, k ; + int segCnt = alignments.segCnt ; + struct _pair *segments = alignments.segments ; + int eid = 0 ; // the exonblock id that the segment update + + // locate the first exonblock that beyond the start of the read. + while ( tag < (int)exonBlocks.size() && ( exonBlocks[tag].end < segments[0].a - 1 + || exonBlocks[tag].chrId != alignments.GetChromId() ) ) + { + ++tag ; + } + // due to the merge of exon blocks, we might need roll back + --tag ; + while ( tag >= 0 && ( exonBlocks[tag].end >= segments[0].a - 1 + && exonBlocks[tag].chrId == alignments.GetChromId() ) ) + { + --tag ; + } + ++tag ; + + for ( i = 0 ; i < segCnt ; ++i ) + { + //if ( i == 0 ) + // printf( "hi %d %s %d %d\n", i, alignments.GetReadId(), segments[i].a, segments[i].b ) ; + for ( j = tag ; j < (int)exonBlocks.size() ; ++j ) + { + if ( exonBlocks[j].end >= segments[i].a - 1 ) + break ; + } + if ( j >= (int)exonBlocks.size() ) + { + // Append a new block + struct _block newSeg ; + newSeg.chrId = alignments.GetChromId() ; + newSeg.start = segments[i].a ; + newSeg.end = segments[i].b ; + + newSeg.leftSplice = -1 ; + newSeg.rightSplice = -1 ; + if ( i > 0 ) + newSeg.leftSplice = segments[i].a ; + if ( i < segCnt - 1 ) + newSeg.rightSplice = segments[i].b ; + + exonBlocks.push_back( newSeg ) ; + + eid = exonBlocks.size() - 1 ; + } + else if ( exonBlocks[j].end < segments[i].b || + ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) ) + { + // If overlaps with a current exon block, so we extend it + if ( exonBlocks[j].end < segments[i].b ) + { + // extends toward right + exonBlocks[j].end = segments[i].b ; + if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = segments[i].a ; + if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = segments[i].b ; + eid = j ; + + // Merge with next few exon blocks + for ( k = j + 1 ; k < (int)exonBlocks.size() ; ++k ) + { + if ( exonBlocks[k].start <= exonBlocks[j].end + 1 ) + { + if ( exonBlocks[k].end > exonBlocks[j].end ) + exonBlocks[j].end = exonBlocks[k].end ; + + if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; + + } + else + break ; + } + + if ( k > j + 1 ) + { + // Remove the merged blocks + int a, b ; + for ( a = j + 1, b = k ; b < (int)exonBlocks.size() ; ++a, ++b ) + exonBlocks[a] = exonBlocks[b] ; + for ( a = 0 ; a < k - ( j + 1 ) ; ++a ) + exonBlocks.pop_back() ; + } + } + else if ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) + { + // extends toward left + exonBlocks[j].start = segments[i].a ; + if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = segments[i].a ; + if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = segments[i].b ; + eid = j ; + + // Merge with few previous exon blocks + for ( k = j - 1 ; k >= 0 ; --k ) + { + if ( exonBlocks[k].end >= exonBlocks[k + 1].start - 1 ) + { + if ( exonBlocks[k + 1].start < exonBlocks[k].start ) + { + exonBlocks[k].start = exonBlocks[k + 1].start ; + } + + if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; + + } + else + break ; + } + + if ( k < j - 1 ) + { + int a, b ; + eid = k + 1 ; + for ( a = k + 2, b = j + 1 ; b < (int)exonBlocks.size() ; ++a, ++b ) + exonBlocks[a] = exonBlocks[b] ; + for ( a = 0 ; a < ( j - 1 ) - k ; ++a ) + exonBlocks.pop_back() ; + + } + } + } + else if ( exonBlocks[j].start > segments[i].b + 1 ) + { + int size = exonBlocks.size() ; + int a ; + // No overlap, insert a new block + struct _block newSeg ; + newSeg.chrId = alignments.GetChromId() ; + newSeg.start = segments[i].a ; + newSeg.end = segments[i].b ; + + newSeg.leftSplice = -1 ; + newSeg.rightSplice = -1 ; + if ( i > 0 ) + newSeg.leftSplice = segments[i].a ; + if ( i < segCnt - 1 ) + newSeg.rightSplice = segments[i].b ; + + // Insert at position j + exonBlocks.push_back( newSeg ) ; + for ( a = size ; a > j ; --a ) + exonBlocks[a] = exonBlocks[a - 1] ; + exonBlocks[a] = newSeg ; + + eid = a ; + } + else + { + // The segment is contained in j + eid = -1 ; + } + + // Merge the block with the mate if the gap is very small + // Note that since reads are already sorted by coordinate, + // the previous exon blocks is built completely. + /*int64_t matePos ; + int mateChrId ; + alignments.GetMatePosition( mateChrId, matePos ) ; + if ( i == 0 && eid > 0 && mateChrId == exonBlocks[ eid ].chrId + && matePos < segments[0].a + && matePos >= exonBlocks[eid - 1].start + && matePos <= exonBlocks[eid - 1].end + && segments[0].a - matePos + 1 <= 500 + && exonBlocks[eid-1].chrId == exonBlocks[eid].chrId + && exonBlocks[eid].start - exonBlocks[eid - 1].end - 1 <= 30 ) + { + printf( "%d: (%d-%d) (%d-%d). %d %d\n", ( segments[0].a + alignments.readLen - 1 ) - matePos + 1, exonBlocks[eid - 1].start, exonBlocks[eid - 1].end, + exonBlocks[eid].start, + exonBlocks[eid].end, eid, exonBlocks.size() ) ; + exonBlocks[eid - 1].end = exonBlocks[eid].end ; + + if ( exonBlocks[eid].leftSplice != -1 && ( exonBlocks[eid - 1].leftSplice == -1 || exonBlocks[eid].leftSplice < exonBlocks[eid - 1].leftSplice ) ) + exonBlocks[eid - 1].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[eid].rightSplice != -1 && exonBlocks[eid].rightSplice > exonBlocks[eid - 1].rightSplice ) + exonBlocks[eid - 1].rightSplice = exonBlocks[eid].rightSplice ; + + int size = exonBlocks.size() ; + for ( j = eid ; j < size - 1 ; ++j ) + exonBlocks[j] = exonBlocks[j + 1] ; + exonBlocks.pop_back() ; + }*/ + } + } + /*for ( int i = 0 ; i < (int)exonBlocks.size() ; ++i ) + { + printf( "%d %d\n", exonBlocks[i].start, exonBlocks[i].end ) ; + }*/ + + if ( exonBlocks.size() > 0 ) + { + BuildBlockChrIdOffset() ; + int cnt = exonBlocks.size() ; + for ( int i = 0 ; i < cnt ; ++i ) + { + exonBlocks[i].contigId = exonBlocks[i].chrId ; + + exonBlocks[i].leftType = 0 ; + exonBlocks[i].rightType = 0 ; + exonBlocks[i].depth = NULL ; + exonBlocks[i].nextCnt = 0 ; + exonBlocks[i].prevCnt = 0 ; + exonBlocks[i].leftStrand = '.' ; + exonBlocks[i].rightStrand = '.' ; + } + } + return exonBlocks.size() ; + } + + void FilterSplitSitesInRegions( std::vector<struct _splitSite> &sites ) + { + int i, j, k ; + int size = sites.size() ; + int bsize = exonBlocks.size() ; + int tag = 0 ; + + for ( i = 0 ; i < bsize ; ++i ) + { + while ( tag < size && ( sites[tag].chrId < exonBlocks[i].chrId || + ( sites[tag].chrId == exonBlocks[i].chrId && sites[tag].pos < exonBlocks[i].start ) ) ) + { + ++tag ; + } + if ( tag >= size ) + break ; + + if ( sites[tag].chrId != exonBlocks[i].chrId || sites[tag].pos > exonBlocks[i].end ) + continue ; + + for ( j = tag + 1 ; j < size ; ++j ) + if ( sites[j].chrId != exonBlocks[i].chrId || sites[j].pos > exonBlocks[i].end ) + break ; + + // [tag,j) holds the split sites in this region. + int leftStrandSupport[2] = {0, 0} ; + int rightStrandSupport[2] = {0, 0} ; + int strandCnt[2] = { 0, 0 } ; + for ( k = tag ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + { + ++strandCnt[0] ; + if ( sites[k].oppositePos < exonBlocks[i].start ) + leftStrandSupport[0] += sites[k].support ; + else if ( sites[k].oppositePos > exonBlocks[i].end ) + rightStrandSupport[0] += sites[k].support ; + } + else if ( sites[k].strand == '+' ) + { + ++strandCnt[1] ; + if ( sites[k].oppositePos < exonBlocks[i].start ) + leftStrandSupport[1] += sites[k].support ; + else if ( sites[k].oppositePos > exonBlocks[i].end ) + rightStrandSupport[1] += sites[k].support ; + } + + } + + + if ( leftStrandSupport[0] == 0 && rightStrandSupport[0] == 0 + && leftStrandSupport[1] != 0 && rightStrandSupport[1] != 0 && strandCnt[0] == 2 ) + { + // check whether a different strand accidentally show up in this region. + bool remove = false ; + for ( k = tag ; k < j ; ++k ) + { + if ( sites[k].strand == '-' && + sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end && + sites[k].support <= 2 ) + { + remove = true ; + break ; + } + } + + if ( remove ) + for ( k = tag ; k < j ; ++k ) + if ( sites[k].strand == '-' ) + sites[k].support = -1 ; + } + else if ( leftStrandSupport[1] == 0 && rightStrandSupport[1] == 0 + && leftStrandSupport[0] != 0 && rightStrandSupport[0] != 0 && strandCnt[1] == 2 ) + { + // check whether a different strand accidentally show up in this region. + bool remove = false ; + for ( k = tag ; k < j ; ++k ) + { + if ( sites[k].strand == '+' && + sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end && + sites[k].support <= 2 ) + { + remove = true ; + break ; + } + } + + if ( remove ) + for ( k = tag ; k < j ; ++k ) + if ( sites[k].strand == '+' ) + sites[k].support = -1 ; + } + + tag = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + sites.resize( k ) ; + } + + // Filter the pair of split sites that is likely merge two genes. + // We filter the case like [..]------------------------[..] + // [..]--[..] [..]-[...] + void FilterGeneMergeSplitSites( std::vector<struct _splitSite> &sites ) + { + int i, j, k ; + int bsize = exonBlocks.size() ; + int ssize = sites.size() ; + + struct _pair32 *siteToBlock = new struct _pair32[ ssize ] ; + struct _adj *adj = new struct _adj[ ssize / 2 + bsize ] ; + bool *visited = new bool[bsize] ; + + int adjCnt = 0 ; + for ( i = 0 ; i < bsize ; ++i ) + { + adj[i].info = 0 ; // in the header, info means the number of next block + adj[i].support = 0 ; + adj[i].next = -1 ; + } + adjCnt = i ; + memset( siteToBlock, -1, sizeof( struct _pair32 ) * ssize ) ; + memset( visited, false, sizeof( bool ) * bsize ) ; + + // Build the graph. + k = 0 ; + for ( i = 0 ; i < bsize ; ++i ) + { + for ( ; k < ssize ; ++k ) + { + if ( sites[k].oppositePos < sites[k].pos ) + continue ; + if ( sites[k].chrId < exonBlocks[i].chrId + || ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) ) + continue ; + break ; + } + + for ( ; k < ssize ; ++k ) + { + if ( sites[k].chrId > exonBlocks[i].chrId + || sites[k].pos > exonBlocks[i].end ) + break ; + + if ( sites[k].oppositePos <= exonBlocks[i].end ) // ignore self-loop + continue ; + + for ( j = i + 1 ; j < bsize ; ++j ) + if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end ) + break ; + if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end ) + { + int p ; + p = adj[i].next ; + while ( p != -1 ) + { + if ( adj[p].ind == j ) + { + ++adj[p].info ; + adj[p].support += sites[k].uniqSupport ; + break ; + } + p = adj[p].next ; + } + if ( p == -1 ) + { + adj[ adjCnt ].ind = j ; + adj[ adjCnt ].info = 1 ; + adj[ adjCnt ].support = sites[k].uniqSupport ; + adj[ adjCnt ].next = adj[i].next ; + adj[i].next = adjCnt ; + ++adj[i].info ; + ++adjCnt ; + } + + siteToBlock[k].a = i ; + siteToBlock[k].b = j ; + } + } + } + for ( k = 0 ; k < ssize ; ++k ) + { + if ( sites[k].oppositePos - sites[k].pos + 1 < 20000 || sites[k].uniqSupport >= 30 ) + continue ; + + int from = siteToBlock[k].a ; + int to = siteToBlock[k].b ; + if ( to - from - 1 < 2 || adj[from].info <= 1 ) + continue ; + + memset( &visited[from], false, sizeof( bool ) * ( to - from + 1 ) ) ; + + int cnt = 0 ; + int p = adj[from].next ; + int max = -1 ; + int maxP = 0 ; + while ( p != -1 ) + { + if ( adj[p].support >= 10 && adj[p].ind <= to ) + ++cnt ; + if ( adj[p].support > max || ( adj[p].support == max && adj[p].ind == to ) ) + { + max = adj[p].support ; + maxP = p ; + } + + if ( adj[p].ind == to && adj[p].info > 1 ) + { + cnt = 0 ; + break ; + } + p = adj[p].next ; + } + if ( cnt <= 1 ) + continue ; + + if ( max != -1 && adj[maxP].ind == to ) + continue ; + + // No other path can reach "to" + p = adj[from].next ; + while ( p != -1 ) + { + if ( adj[p].ind != to ) + { + //if ( sites[k].pos ==43917158 ) + // printf( "hi %d %d. (%d %d)\n", from, adj[p].ind, exonBlocks[ adj[p].ind ].start, exonBlocks[ adj[p].ind ].end ) ; + if ( CanReach( adj[p].ind, to, adj, visited ) ) + break ; + } + p = adj[p].next ; + } + if ( p != -1 ) + continue ; + + //if ( sites[k].pos == 34073267 ) + // printf( "hi %d %d. (%d %d)\n", from, to, exonBlocks[to].start, exonBlocks[to].end ) ; + + // There are some blocks between that can reach "to" + for ( i = from + 1 ; i < to ; ++i ) + { + p = adj[i].next ; + while ( p != -1 ) + { + if ( adj[p].ind == to && adj[p].support >= 10 ) + break ; + p = adj[p].next ; + } + if ( p != -1 ) + break ; + } + if ( i >= to ) + continue ; + + // Filter the site + //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ; + sites[k].support = -1 ; + for ( j = k + 1 ; j < ssize ; ++j ) + { + if ( sites[j].pos == sites[k].oppositePos && sites[j].oppositePos == sites[k].pos ) + sites[j].support = -1 ; + if ( sites[j].pos > sites[k].oppositePos ) + break ; + } + } + + k = 0 ; + for ( i = 0 ; i < ssize ; ++i ) + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + sites.resize( k ) ; + + // Filter the intron on the different strand of a gene. + ssize = k ; + k = 0 ; + for ( i = 0 ; i < bsize ; ) + { + int farthest = exonBlocks[i].end ; + + for ( j = i ; j < bsize ; ++j ) + { + if ( exonBlocks[j].start > farthest || exonBlocks[i].chrId != exonBlocks[j].chrId ) + break ; + int p = adj[i].next ; + while ( p != -1 ) + { + if ( exonBlocks[ adj[p].ind ].end > farthest ) + farthest = exonBlocks[ adj[p].ind ].end ; + p = adj[p].next ; + } + } + + for ( ; k < ssize ; ++k ) + { + if ( sites[k].chrId < exonBlocks[i].chrId || + ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) ) + continue ; + break ; + } + + if ( sites[k].chrId > exonBlocks[i].chrId || sites[k].pos > farthest ) + { + i = j ; + continue ; + } + //printf( "%d %d. %d %d. %d %d\n", i, j, exonBlocks[i].start, farthest, k, sites[k].pos ) ; + + + int from = k ; + int to ; + for ( to = k ; to < ssize ; ++to ) + if ( sites[to].chrId != exonBlocks[i].chrId || sites[to].pos > farthest ) + break ; + + int strandSupport[2] = {0, 0}; + int strandCount[2] = {0, 0}; + for ( k = from ; k < to ; ++k ) + { + if ( sites[k].oppositePos <= sites[k].pos ) + continue ; + + int tag = ( sites[k].strand == '+' ? 1 : 0 ) ; + strandSupport[tag] += sites[k].support ; + ++strandCount[tag] ; + } + + // Not mixtured. + if ( strandCount[0] * strandCount[1] == 0 ) + { + i = j ; + k = to ; + continue ; + } + + char removeStrand = 0 ; + if ( strandCount[0] == 1 && strandCount[1] >= 3 + && strandSupport[1] >= 20 * strandSupport[0] && strandSupport[0] <= 5 ) + { + removeStrand = '-' ; + } + else if ( strandCount[1] == 1 && strandCount[0] >= 3 + && strandSupport[0] >= 20 * strandSupport[1] && strandSupport[1] <= 5 ) + { + removeStrand = '+' ; + } + + if ( removeStrand == 0 ) + { + i = j ; k = to ; + continue ; + } + + for ( k = from ; k < to ; ++k ) + { + if ( sites[k].strand == removeStrand && sites[k].oppositePos >= exonBlocks[i].start && sites[k].oppositePos <= farthest ) + { + //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ; + sites[k].support = -1 ; + } + } + + i = j ; + k = to ; + } + + k = 0 ; + for ( i = 0 ; i < ssize ; ++i ) + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + sites.resize( k ) ; + + delete[] visited ; + delete[] siteToBlock ; + delete[] adj ; + } + + void SplitBlocks( Alignments &alignments, std::vector< struct _splitSite > &splitSites ) + { + std::vector<struct _block> rawExonBlocks = exonBlocks ; + int i, j ; + int tag = 0 ; + int bsize = rawExonBlocks.size() ; + int ssize = splitSites.size() ; + + // Make sure not overflow + struct _splitSite tmp ; + tmp.pos = -1 ; + splitSites.push_back( tmp ) ; + + exonBlocks.clear() ; + for ( i = 0 ; i < bsize ; ++i ) + { + while ( tag < ssize && ( splitSites[tag].chrId < rawExonBlocks[i].chrId || + ( splitSites[tag].chrId == rawExonBlocks[i].chrId && splitSites[tag].pos < rawExonBlocks[i].start ) ) ) + ++tag ; + + int l ; + for ( l = tag ; l < ssize ; ++l ) + if ( splitSites[l].chrId != rawExonBlocks[i].chrId || splitSites[l].pos > rawExonBlocks[i].end ) + break ; + int64_t start = rawExonBlocks[i].start ; + int64_t end = rawExonBlocks[i].end ; + //printf( "%lld %lld\n", start, end ) ; + //printf( "%s %s: %d %d\n", alignments.GetChromName( splitSites[tag].chrId ), alignments.GetChromName( rawExonBlocks[i].chrId ), tag, l ) ; + for ( j = tag ; j <= l ; ++j ) + { + int leftType = 0 ; + int rightType = 0 ; + if ( j > tag ) + { + start = splitSites[j - 1].pos ; // end is from previous stage + leftType = splitSites[j - 1].type ; + } + else + start = rawExonBlocks[i].start ; + + if ( j <= l - 1 ) + { + end = splitSites[j].pos ; + rightType = splitSites[j].type ; + } + else + end = rawExonBlocks[i].end ; + + if ( leftType == 2 ) + ++start ; + if ( rightType == 1 ) + --end ; + + struct _block tmpB ; + tmpB = rawExonBlocks[i] ; + tmpB.start = start ; + tmpB.end = end ; + tmpB.depthSum = 0 ; + tmpB.ratio = 0 ; + //printf( "\t%lld %lld\n", start, end ) ; + tmpB.leftType = leftType ; + tmpB.rightType = rightType ; + tmpB.depth = NULL ; + + exonBlocks.push_back( tmpB ) ; + // handle the soft boundary is the same as the hard boundary case + // or adjacent splice sites + /*if ( j == tag && start == end ) + { + exonBlocks.pop_back() ; + --end ; + } + else if ( j == l && start > end ) + { + exonBlocks.pop_back() ; + }*/ + if ( start > end + //|| ( tmpB.start == rawExonBlocks[i].start && tmpB.end != rawExonBlocks[i].end && tmpB.end - tmpB.start + 1 <= 7 ) + //|| ( tmpB.end == rawExonBlocks[i].end && tmpB.start != rawExonBlocks[i].start && tmpB.end - tmpB.start + 1 <= 7 )) // the case of too short overhang + || ( tmpB.leftType == 0 && tmpB.rightType == 2 && tmpB.end - tmpB.start + 1 <= 9 ) + || ( tmpB.leftType == 1 && tmpB.rightType == 0 && tmpB.end - tmpB.start + 1 <= 9 ) ) + { + exonBlocks.pop_back() ; + } + /*else if ( start == end ) + { + + }*/ + } + } + BuildBlockChrIdOffset() ; + } + + void ComputeDepth( Alignments &alignments ) + { + // Go through the alignment list again to fill in the depthSum; + int i ; + int j ; + int tag = 0 ; + int blockCnt = exonBlocks.size() ; + + std::vector<struct _block> newExonBlocks ; + while ( alignments.Next() ) + { + int segCnt = alignments.segCnt ; + struct _pair *segments = alignments.segments ; + + while ( tag < blockCnt && ( exonBlocks[tag].chrId < alignments.GetChromId() || + ( exonBlocks[tag].chrId == alignments.GetChromId() && exonBlocks[tag].end < segments[0].a ) ) ) + { + AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; + ++tag ; + } + for ( i = 0 ; i < segCnt ; ++i ) + { + for ( j = tag ; j < blockCnt && ( exonBlocks[j].chrId == alignments.GetChromId() && exonBlocks[j].start <= segments[i].b ) ; ++j ) + { + int64_t s = -1, e = -1 ; + exonBlocks[j].depthSum += Overlap( segments[i].a, segments[i].b, exonBlocks[j].start, exonBlocks[j].end, s, e ) ; + if ( s == -1 ) + continue ; + + if ( exonBlocks[j].depth == NULL ) + { + int len = exonBlocks[j].end - exonBlocks[j].start + 1 ; + exonBlocks[j].depth = new int[len + 1] ; + memset( exonBlocks[j].depth, 0, sizeof( int ) * ( len + 1 ) ) ; + exonBlocks[j].leftSplice = exonBlocks[j].start ; + exonBlocks[j].rightSplice = exonBlocks[j].end ; + } + int *depth = exonBlocks[j].depth ; + ++depth[s - exonBlocks[j].start] ; + --depth[e - exonBlocks[j].start + 1] ; // notice the +1 here, since the decrease of coverage actually happens at next base. + + // record the longest alignment stretch support the splice site. + if ( exonBlocks[j].leftType == 1 && segments[i].a == exonBlocks[j].start + && e > exonBlocks[j].leftSplice ) + { + exonBlocks[j].leftSplice = e ; + } + if ( exonBlocks[j].rightType == 2 && segments[i].b == exonBlocks[j].end + && s < exonBlocks[j].rightSplice ) + { + exonBlocks[j].rightSplice = s ; + } + } + } + } + + for ( ; tag < blockCnt ; ++tag ) + AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; + exonBlocks.clear() ; + + // Due to multi-alignment, we may skip some alignments that determines leftSplice and + // rightSplice, hence filtered some subexons. As a result, some subexons' anchor subexon will disapper + // and we need to change its boundary type. + blockCnt = newExonBlocks.size() ; + for ( i = 0 ; i < blockCnt ; ++i ) + { + if ( ( i == 0 && newExonBlocks[i].leftType == 2 ) || + ( i > 0 && newExonBlocks[i].leftType == 2 && newExonBlocks[i - 1].end + 1 != newExonBlocks[i].start ) ) + { + newExonBlocks[i].leftType = 0 ; + } + + if ( ( i == blockCnt - 1 && newExonBlocks[i].rightType == 1 ) || + ( i < blockCnt - 1 && newExonBlocks[i].rightType == 1 && + newExonBlocks[i].end + 1 != newExonBlocks[i + 1].start ) ) + newExonBlocks[i].rightType = 0 ; + } + exonBlocks = newExonBlocks ; + } + + // If two blocks whose soft boundary are close to each other, we can merge them. + void MergeNearBlocks() + { + std::vector<struct _block> rawExonBlocks = exonBlocks ; + int i, k ; + int bsize = rawExonBlocks.size() ; + int *newIdx = new int[bsize] ; // used for adjust prev and next. Note that by merging, it won't change the number of prevCnt or nextCnt. + + exonBlocks.clear() ; + exonBlocks.push_back( rawExonBlocks[0] ) ; + k = 0 ; + newIdx[0] = 0 ; + for ( i = 1 ; i < bsize ; ++i ) + { + if ( rawExonBlocks[i].chrId == exonBlocks[k].chrId + && rawExonBlocks[i].leftType == 0 && exonBlocks[k].rightType == 0 + && rawExonBlocks[i].start - exonBlocks[k].end - 1 <= 50 + && rawExonBlocks[i].depthSum / double( rawExonBlocks[i].end - rawExonBlocks[i].start + 1 ) < 3.0 + && exonBlocks[k].depthSum / double( exonBlocks[k].end - exonBlocks[i].start + 1 ) < 3.0 ) + { + exonBlocks[k].end = rawExonBlocks[i].end ; + exonBlocks[k].rightType = rawExonBlocks[i].rightType ; + exonBlocks[k].rightStrand = rawExonBlocks[i].rightStrand ; + exonBlocks[k].depthSum += rawExonBlocks[i].depthSum ; + if ( rawExonBlocks[i].rightSplice != -1 ) + exonBlocks[k].rightSplice = rawExonBlocks[i].rightSplice ; + + if ( exonBlocks[k].nextCnt > 0 ) + delete[] exonBlocks[k].next ; + exonBlocks[k].nextCnt = rawExonBlocks[i].nextCnt ; + exonBlocks[k].next = rawExonBlocks[i].next ; + } + else + { + exonBlocks.push_back( rawExonBlocks[i] ) ; + ++k ; + } + newIdx[i] = k ; + } + BuildBlockChrIdOffset() ; + bsize = exonBlocks.size() ; + for ( i = 0 ; i < bsize ; ++i ) + { + int cnt = exonBlocks[i].prevCnt ; + for ( k = 0 ; k < cnt ; ++k ) + exonBlocks[i].prev[k] = newIdx[ exonBlocks[i].prev[k] ] ; + + cnt = exonBlocks[i].nextCnt ; + for ( k = 0 ; k < cnt ; ++k ) + exonBlocks[i].next[k] = newIdx[ exonBlocks[i].next[k] ] ; + } + delete[] newIdx ; + } + + void AddIntronInformation( std::vector<struct _splitSite> &sites, Alignments &alignments ) + { + // Add the connection information, support information and the strand information. + int i, j, k, tag ; + tag = 0 ; + int scnt = sites.size() ; + int exonBlockCnt = exonBlocks.size() ; + bool flag ; + + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + exonBlocks[i].prevCnt = exonBlocks[i].nextCnt = 0 ; + exonBlocks[i].prev = exonBlocks[i].next = NULL ; + exonBlocks[i].leftStrand = exonBlocks[i].rightStrand = '.' ; + exonBlocks[i].leftSplice = exonBlocks[i].rightSplice = 0 ; + } + + // Now, we add the region id and compute the length of each region, here the region does not contain possible IR event. + int regionId = 0 ; + for ( i = 0 ; i < exonBlockCnt ; ) + { + if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) + { + j = i + 1 ; + } + else + for ( j = i + 1 ; j < exonBlockCnt ; ++j ) + if ( exonBlocks[j].start > exonBlocks[j - 1].end + 1 || ( exonBlocks[j].leftType == 2 && exonBlocks[j].rightType == 1 ) ) + break ; + for ( k = i ; k < j ; ++k ) + exonBlocks[k].contigId = regionId ; + ++regionId ; + i = j ; + } + int *regionLength = new int[regionId] ; + memset( regionLength, 0, sizeof( int ) * regionId ) ; + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + regionLength[ exonBlocks[i].contigId ] += exonBlocks[i].end - exonBlocks[i].start + 1 ; + } + + for ( i = 0 ; i < scnt ; ) + { + for ( j = i ; j < scnt ; ++j ) + { + if ( sites[j].chrId != sites[i].chrId || + sites[j].pos != sites[i].pos || + sites[j].type != sites[i].type ) + break ; + } + // [i,j-1] are the indices of the sites with same coordinate + // It is possible a sites corresponds to 2 strands, then we should only keep one of them. + char strand = '.' ; + int strandSupport[2] = {0, 0}; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + strandSupport[0] += sites[k].support ; + else if ( sites[k].strand == '+' ) + strandSupport[1] += sites[k].support ; + } + if ( strandSupport[0] > strandSupport[1] ) + strand = '-' ; + else if ( strandSupport[1] > strandSupport[0] ) + strand = '+' ; + + + int cnt = j - i ; + // Locate the first subexon that can overlap with this site + while ( tag < exonBlockCnt ) + { + if ( ( exonBlocks[tag].chrId == sites[i].chrId && exonBlocks[tag].end >= sites[i].pos ) + || exonBlocks[tag].chrId > sites[i].chrId ) + break ; + ++tag ; + } + flag = false ; + for ( k = tag; k < exonBlockCnt ; ++k ) + { + if ( exonBlocks[tag].start > sites[i].pos || + exonBlocks[tag].chrId > sites[i].chrId ) + break ; + + if ( exonBlocks[tag].start == sites[i].pos || + exonBlocks[tag].end == sites[i].pos ) + { + flag = true ; + break ; + } + } + + if ( !flag ) + { + i = j ; + continue ; + } + + if ( sites[i].type == 1 && exonBlocks[tag].start == sites[i].pos ) + { + exonBlocks[tag].prevCnt = 0 ; + exonBlocks[tag].prev = new int[cnt] ; + exonBlocks[tag].leftStrand = strand ; + + // And we also need to put the "next" here. + // Here we assume the oppositePos sorted in increasing order + k = tag - 1 ; + for ( int l = j - 1 ; l >= i ; --l ) + { + for ( ; k >= 0 ; --k ) + { + if ( exonBlocks[k].end < sites[l].oppositePos || exonBlocks[k].chrId != sites[l].chrId ) + break ; + if ( exonBlocks[k].end == sites[l].oppositePos ) //&& exonBlocks[k].rightType == sites[l].type ) + { + if ( sites[l].strand != strand || + sites[l].strand != exonBlocks[k].rightStrand ) + break ; + exonBlocks[k].next[ exonBlocks[k].nextCnt ] = tag ; + exonBlocks[tag].prev[ exonBlocks[tag].prevCnt] = k ; // Note that the prev are sorted in decreasing order + ++exonBlocks[k].nextCnt ; + ++exonBlocks[tag].prevCnt ; + double factor = 1.0 ; + if ( regionLength[ exonBlocks[k].contigId ] < alignments.readLen ) + { + int left ; + for ( left = k ; left >= 0 ; --left ) + if ( exonBlocks[left].contigId != exonBlocks[k].contigId ) + break ; + ++left ; + + int prevType = 0 ; + // only consider the portion upto k. + for ( int m = left ; m <= k ; ++m ) + if ( exonBlocks[m].leftType == 1 ) + ++prevType ; + + if ( prevType == 0 ) + factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[k].contigId ] ; + } + if ( regionLength[ exonBlocks[tag].contigId ] < alignments.readLen ) + { + int right ; + for ( right = tag ; right < exonBlockCnt ; ++right ) + if ( exonBlocks[right].contigId != exonBlocks[tag].contigId ) + break ; + --right ; + + int nextType = 0 ; + for ( int m = tag ; m <= right ; ++m ) + if ( exonBlocks[m].rightType == 2 ) + ++nextType ; + + if ( nextType == 0 ) + factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[tag].contigId ] ; + } + if ( factor > 10 ) + factor = 10 ; + + if ( exonBlocks[k].contigId != exonBlocks[tag].contigId ) // the support of introns between regions. + { + // Adjust the support if one of the anchor exon is too short. + // This avoid the creation of alternative 3'/5' end due to low coverage from too short anchor. + exonBlocks[tag].leftSplice += sites[l].support * factor ; + exonBlocks[k].rightSplice += sites[l].support * factor ; + } + break ; + } + } + } + } + else if ( sites[i].type == 2 && exonBlocks[tag].end == sites[i].pos ) + { + exonBlocks[tag].nextCnt = 0 ; // cnt ; it should reach cnt after putting the ids in + exonBlocks[tag].next = new int[cnt] ; + exonBlocks[tag].rightStrand = strand ; + } + + i = j ; + } + // Reverse the prev list + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + for ( j = 0, k = exonBlocks[i].prevCnt - 1 ; j < k ; ++j, --k ) + { + int tmp = exonBlocks[i].prev[j] ; + exonBlocks[i].prev[j] = exonBlocks[i].prev[k] ; + exonBlocks[i].prev[k] = tmp ; + } + } + + // If all of the subexon anchored the intron are filtered, we need to change the type of the other anchor. + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + if ( exonBlocks[i].leftType == 1 && exonBlocks[i].prevCnt == 0 ) + { + exonBlocks[i].leftType = 0 ; + exonBlocks[i].leftStrand = '.' ; + if ( i > 0 && exonBlocks[i - 1].rightType == 1 ) + exonBlocks[i - 1].rightType = 0 ; + } + if ( exonBlocks[i].rightType == 2 && exonBlocks[i].nextCnt == 0 ) + { + exonBlocks[i].rightType = 0 ; + exonBlocks[i].rightStrand = '.' ; + if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].leftType == 2 ) + exonBlocks[i + 1].leftType = 0 ; + } + } + MergeNearBlocks() ; + + delete[] regionLength ; + } + + double PickLeftAndRightRatio( double l, double r ) + { + if ( l >= 0 && r >= 0 ) + return l < r ? l : r ; + else if ( l < 0 && r < 0 ) + return -1 ; + else if ( l < 0 ) + return r ; + else + return l ; + } + + double PickLeftAndRightRatio( const struct _block &b ) + { + return PickLeftAndRightRatio( b.leftRatio, b.rightRatio ) ; + } + + void ComputeRatios() + { + int i, j ; + int exonBlockCnt = exonBlocks.size() ; + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + exonBlocks[i].leftRatio = -1 ; + exonBlocks[i].rightRatio = -1 ; + if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) + { + // ]...[ + double anchorAvg = 0 ; + double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + if ( i >= 1 && exonBlocks[i - 1].chrId == exonBlocks[i].chrId ) + { + anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; + if ( anchorAvg > 1 && avgDepth > 1 ) + exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].chrId == exonBlocks[i].chrId ) + { + anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; + if ( anchorAvg > 1 && avgDepth > 1 ) + exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + } + + if ( exonBlocks[i].leftType == 0 && exonBlocks[i].rightType == 1 ) + { + // (..[ , overhang subexon. + double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + double anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; + if ( avgDepth > 1 && anchorAvg > 1 ) + exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + + /*if ( exonBlocks[i].leftType == 1 ) + { + // For the case (...[, the leftRatio is actuall the leftratio of the subexon on its right. + int len = 0 ; + double depthSum = 0 ; + int tag = i ; + + for ( j = 0 ; j < exonBlocks[tag].prevCnt ; ++j ) + { + int k = exonBlocks[tag].prev[j] ; + len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; + depthSum += exonBlocks[k].depthSum ; + } + double otherAvgDepth = depthSum / len ; + double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; + + // adjust avgDepth by looking into the following exon blocks(subexon) in the same region whose left side is not hard end + for ( j = tag + 1 ; j < exonBlockCnt ; ++j ) + { + if ( exonBlocks[j].start != exonBlocks[j - 1].end + 1 || exonBlocks[j].leftType == 1 ) + break ; + double tmp = GetAvgDepth( exonBlocks[j] ) ; + if ( tmp > avgDepth ) + avgDepth = tmp ; + } + + if ( avgDepth < 1 ) + avgDepth = 1 ; + if ( otherAvgDepth < 1 ) + otherAvgDepth = 1 ; + + //exonBlocks[i].leftRatio = pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log( 2.0 ), 2.0 / 3.0 ); + exonBlocks[i].leftRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; + + }*/ + if ( exonBlocks[i].rightType == 0 && exonBlocks[i].leftType == 2 ) + { + // for overhang subexon. + double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + double anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; + if ( avgDepth > 1 && anchorAvg > 1 ) + exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + + } + /*if ( exonBlocks[i].rightType == 2 ) + { + int len = 0 ; + double depthSum = 0 ; + int tag = i ; + + for ( j = 0 ; j < exonBlocks[tag].nextCnt ; ++j ) + { + int k = exonBlocks[tag].next[j] ; + len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; + depthSum += exonBlocks[k].depthSum ; + } + double otherAvgDepth = depthSum / len ; + double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; + + // adjust avgDepth by looking into the earlier exon blocks(subexon) in the same region whose right side is not hard end + for ( j = tag - 1 ; j >= 0 ; --j ) + { + if ( exonBlocks[j].end + 1 != exonBlocks[j + 1].start || exonBlocks[j].rightType == 2 ) + break ; + double tmp = GetAvgDepth( exonBlocks[j] ) ; + if ( tmp > avgDepth ) + avgDepth = tmp ; + } + + if ( avgDepth < 1 ) + avgDepth = 1 ; + if ( otherAvgDepth < 1 ) + otherAvgDepth = 1 ; + + exonBlocks[i].rightRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; + //pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log(2.0), 2.0 / 3.0 ); + }*/ + // The remaining case the islands, (...) + } + + // go through each region of subexons, and only compute the ratio for the first and last hard boundary + for ( i = 0 ; i < exonBlockCnt ; ) + { + // the blocks from [i,j) corresponds to a region, namely consecutive subexons. + for ( j = i + 1 ; j < exonBlockCnt ; ++j ) + if ( exonBlocks[j].contigId != exonBlocks[i].contigId ) + break ; + + int leftSupport = 0 ; + int rightSupport = 0 ; + int leftTag = j, rightTag = i - 1 ; + for ( int k = i ; k < j ; ++k ) + { + if ( exonBlocks[k].leftType == 1 && exonBlocks[k].leftSplice > 0 ) + { + leftSupport += exonBlocks[k].leftSplice ; + if ( k < leftTag ) + leftTag = k ; + } + if ( exonBlocks[k].rightType == 2 && exonBlocks[k].rightSplice > 0 ) + { + rightSupport += exonBlocks[k].rightSplice ; + if ( k > rightTag ) + rightTag = k ; + } + } + + if ( leftSupport > 0 && rightSupport > 0 ) + { + // when the right splice support is much greater than that of the left side, there should be a soft boundary for the left side. + // Wether we want to include this soft boundary or not will be determined in class, when it looked at whether the overhang block + // should be kept or not. + exonBlocks[ leftTag ].leftRatio = sqrt( rightSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( leftSupport ) ) ; //+ log( leftSupport ) ) ; + exonBlocks[ rightTag ].rightRatio = sqrt( leftSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( rightSupport ) ) ; //+ log( rightSupport ) ) ; + } + + i = j ; // don't forget this. + } + } +} ; + +#endif