annotate PsiCLASS-1.0.2/GetTrustedSplice.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #include <math.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 #include <vector>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #include <algorithm>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 #include "alignments.hpp"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 #define MAX(x, y) (((x)<(y))?(y):(x))
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9 #define MIN(x, y) (((x)<(y))?(x):(y))
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12 "Options:\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13 "\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15 struct _intron
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 int chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 int start, end ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 double support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 int uniqSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21 int secSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 char strand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 int editDist ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25 int sampleSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26 } ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 struct _site
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 int chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 int pos ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 double support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34 char strand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35 int associatedIntronCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 } ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 bool CompIntrons( struct _intron a, struct _intron b )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 if ( a.chrId != b.chrId )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41 return a.chrId < b.chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 else if ( a.start != b.start )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 return a.start < b.start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 else if ( a.end != b.end )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 return a.end < b.end ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46 return false ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 bool CompSites( struct _site a, struct _site b )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 if ( a.chrId != b.chrId )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 return a.chrId < b.chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 else if ( a.pos != b.pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54 return a.pos < b.pos ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 return false ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 void CoalesceIntrons( std::vector<struct _intron> &introns )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60 std::sort( introns.begin(), introns.end(), CompIntrons ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61 int size = introns.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 int i, k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 k = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 for ( i = 1 ; i < size ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 && introns[i].end == introns[k].end )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 introns[k].support += introns[i].support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 introns[k].uniqSupport += introns[i].uniqSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 introns[k].secSupport += introns[i].secSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 introns[k].sampleSupport += introns[i].sampleSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 if ( introns[k].strand == '?' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 introns[k].strand = introns[i].strand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 ++k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 introns[k] = introns[i] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 introns.resize( k + 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 void CoalesceSites( std::vector<struct _site> &sites )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 std::sort( sites.begin(), sites.end(), CompSites ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 int size = sites.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90 int i, k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 k = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 for ( i = 1 ; i < size ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 sites[k].support += sites[i].support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 ++k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 sites[k] = sites[i] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 sites.resize( k + 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 int main( int argc, char *argv[] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 int i, j, k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 FILE *fpList ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 FILE *fp ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113 char spliceFile[2048] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 char chrName[1024], strand[3] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 double support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 Alignments alignments ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 std::vector<struct _intron> introns ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 std::vector<struct _site> sites ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 double averageSupportThreshold = 0.5 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 if ( argc <= 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 printf( "%s", usage ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 exit( 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 for ( i = 3 ; i < argc ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 if ( !strcmp( argv[ i ], "-a" ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 averageSupportThreshold = atof( argv[i + 1] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 ++i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 printf( "Unknown option: %s", argv[i] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 exit( 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 alignments.Open( argv[2] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 // Get the number of samples.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145 fpList = fopen( argv[1], "r" ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146 int sampleCnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 while ( fscanf( fpList, "%s", spliceFile ) != EOF )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149 ++sampleCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 fclose( fpList ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 // Get all the introns.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154 fpList = fopen( argv[1], "r" ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155 while ( fscanf( fpList, "%s", spliceFile ) != EOF )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 fp = fopen( spliceFile, "r" ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158 while ( fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159 strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162 if ( support <= 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
163 support = 0.1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
164 else if ( support == 1 && sampleCnt > 5 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
165 support = 0.75 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
166
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
167 struct _intron ni ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
168 ni.chrId = alignments.GetChromIdFromName( chrName ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
169 ni.start = start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
170 ni.end = end ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
171 ni.support = support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
172 ni.strand = strand[0] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
173 ni.uniqSupport = uniqSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
174 ni.secSupport = secSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
175 ni.sampleSupport = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
176 ni.editDist = uniqEditDistance + secEditDistance ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
177 introns.push_back( ni ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
178 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
179 fclose( fp ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
180
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
181 CoalesceIntrons( introns ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
182 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
183 fclose( fpList ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
184
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
185 // Obtain the split sites.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
186 int intronCnt = introns.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
187 for ( i = 0 ; i < intronCnt ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
188 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
189 struct _site ns ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
190 ns.chrId = introns[i].chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
191 ns.associatedIntronCnt = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
192 ns.support = introns[i].support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
193 ns.strand = introns[i].strand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
194
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
195 ns.pos = introns[i].start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
196 sites.push_back( ns ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
197 ns.pos = introns[i].end ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
198 sites.push_back( ns ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
199 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
200 CoalesceSites( sites ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
201
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
202 // Get the chromsomes with too many split sites.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
203 int siteCnt = sites.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
204 std::vector<bool> badChrom ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206 badChrom.resize( alignments.GetChromCount() ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
207 int size = alignments.GetChromCount() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
208 for ( i = 0 ; i < size ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
209 badChrom[i] = false ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
210
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
211 for ( i = 0 ; i < siteCnt ; )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
212 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
213 for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
214 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
215 //printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
216 if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
217 badChrom[ sites[i].chrId ] = true ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
218 i = j ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
219 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
220
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
221 // Output the valid introns.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
222 k = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
223 double unit = sampleCnt / 50 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
224 if ( unit < 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
225 unit = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
226
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
227 int longIntronSize ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
228 std::vector<int> intronSizes ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
229 for (i = 0 ; i < intronCnt ; ++i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
230 intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
231 std::sort( intronSizes.begin(), intronSizes.end() ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
232 longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
233 if ( longIntronSize > 100000 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
234 longIntronSize = 100000 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
235 for ( i = 0 ; i < intronCnt ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
236 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
237 if ( introns[i].support / sampleCnt < averageSupportThreshold )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
238 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
239
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
240 if ( badChrom[ introns[i].chrId ] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
241 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
242 if ( introns[i].sampleSupport <= sampleCnt / 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
243 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
244 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
245
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
246 if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
247 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
248
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
249 //Locate the two split sites.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
250 while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
251 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
252 ++k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
253 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
254 int a, b ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
255 a = k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
256 for ( b = a + 1 ; b < siteCnt ; ++b )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
257 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
258 if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
259 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
260 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
261 double siteSupport = MAX( sites[a].support, sites[b].support ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
262 //if ( introns[i].start == 100233371 && introns[i].end == 100236850 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
263 // printf( "test: %lf %lf\n", introns[i].support, siteSupport) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
264 if ( introns[i].support < siteSupport / 10.0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
265 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
266 double needSample = MIN( ( -log( introns[i].support / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
267 if ( introns[i].sampleSupport < needSample )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
268 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
269 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
270
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
271 if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
272 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
273 if ( introns[i].sampleSupport <= sampleCnt * 0.01 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
274 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
275 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
276 /*if ( sampleCnt >= 50 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
277 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
278 // just some randomly intron.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
279 if ( introns[i].sampleSupport == 1
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
280 && ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
281 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
282 }*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
283
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
284 /*if ( introns[i].end - introns[i].start + 1 < 50 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
285 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
286 int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
287 int flag = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
288
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
289 if ( introns[i].sampleSupport < needSample )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
290 flag = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
291 if ( flag == 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
292 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
293 }*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
294 if ( introns[i].strand == '?' && sampleCnt == 1 &&
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
295 ( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
296 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
297
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
298 // Since the strand is uncertain, the alinger may make different decision sample from sample.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
299 // To keep this intron, one of its splice sites must be more supported than adjacent splice sites.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
300 if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
301 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
302 int s, e ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
303 int l ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
304 int cnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
305 for ( l = 0 ; l < 2 ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
306 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
307 int ind = ( l == 0 ) ? a : b ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
308 double max = sites[ind].support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
309 int maxTag = ind ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
310 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
311 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
312 if ( sites[s].pos + 7 < sites[s + 1].pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
313 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
314 if ( sites[s].support >= max )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
315 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
316 max = sites[s].support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
317 maxTag = s ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
318 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
319 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
320
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
321 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
322 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
323 if ( sites[e].pos - 7 > sites[e - 1].pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
324 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
325 if ( sites[e].support >= max )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
326 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
327 max = sites[e].support ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
328 maxTag = e ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
329 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
330 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
331
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
332 if ( maxTag == ind )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
333 ++cnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
334 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
335 if ( cnt == 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
336 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
337 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
338
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
339 // Test whether this a intron coming from a wrong strand
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
340 /*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
341 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
342 int plusStrand = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
343 int minusStrand = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
344 int l ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
345 int s, e ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
346
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
347 if ( introns[i].strand == '+' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
348 plusStrand = 2 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
349 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
350 minusStrand = 2 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
351
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
352 for ( l = 0 ; l < 2 ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
353 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
354 int ind = ( l == 0 ) ? a : b ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
355 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
356 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
357 if ( sites[s].pos + 10000 < sites[s + 1].pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
358 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
359
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
360 if ( sites[s].strand == '+' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
361 ++plusStrand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
362 else if ( sites[s].strand == '-' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
363 ++minusStrand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
364 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
365
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
366 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
367 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
368 if ( sites[e].pos - 10000 > sites[e - 1].pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
369 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
370
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
371 if ( sites[e].strand == '+' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
372 ++plusStrand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
373 else if ( sites[e].strand == '-' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
374 ++minusStrand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
375 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
376 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
377
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
378 if ( introns[i].start == 161517978 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
379 printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
380
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
381 if ( introns[i].strand == '+' && minusStrand >= 20 && plusStrand == 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
382 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
383 else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
384 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
385
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
386 }*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
387
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
388 // Filter a intron if one of its splice site associated with too many introns.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
389 /*if ( sites[a].associatedIntronCnt >= 10 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
390 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
391 int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
392 if ( introns[i].support < sites[a].support / 100 &&
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
393 introns[i].sampleSupport < needSample )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
394 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
395 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
396 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
397 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
398 if ( sites[b].associatedIntronCnt >= 10 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
399 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
400 int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
401 if ( introns[i].support < sites[b].support / 100 &&
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
402 introns[i].sampleSupport < needSample )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
403 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
404 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
405 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
406 }*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
407
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
408
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
409 // Test for long intron
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
410 if ( introns[i].end - introns[i].start + 1 >= longIntronSize )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
411 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
412 int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
413 int flag = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
414 if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
415 || introns[i].uniqSupport / sampleCnt < 1
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
416 || introns[i].sampleSupport < needSample )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
417 flag = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
418 if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
419 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
420 if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
421 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
422 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
423
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
424
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
425 printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
426 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
427
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
428 return 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
429 }