0
|
1 #include <stdio.h>
|
|
2 #include <math.h>
|
|
3 #include <vector>
|
|
4 #include <algorithm>
|
|
5
|
|
6 #include "alignments.hpp"
|
|
7
|
|
8 #define MAX(x, y) (((x)<(y))?(y):(x))
|
|
9 #define MIN(x, y) (((x)<(y))?(x):(y))
|
|
10
|
|
11 char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n"
|
|
12 "Options:\n"
|
|
13 "\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ;
|
|
14
|
|
15 struct _intron
|
|
16 {
|
|
17 int chrId ;
|
|
18 int start, end ;
|
|
19 double support ;
|
|
20 int uniqSupport ;
|
|
21 int secSupport ;
|
|
22 char strand ;
|
|
23 int editDist ;
|
|
24
|
|
25 int sampleSupport ;
|
|
26 } ;
|
|
27
|
|
28 struct _site
|
|
29 {
|
|
30 int chrId ;
|
|
31 int pos ;
|
|
32 double support ;
|
|
33
|
|
34 char strand ;
|
|
35 int associatedIntronCnt ;
|
|
36 } ;
|
|
37
|
|
38 bool CompIntrons( struct _intron a, struct _intron b )
|
|
39 {
|
|
40 if ( a.chrId != b.chrId )
|
|
41 return a.chrId < b.chrId ;
|
|
42 else if ( a.start != b.start )
|
|
43 return a.start < b.start ;
|
|
44 else if ( a.end != b.end )
|
|
45 return a.end < b.end ;
|
|
46 return false ;
|
|
47 }
|
|
48
|
|
49 bool CompSites( struct _site a, struct _site b )
|
|
50 {
|
|
51 if ( a.chrId != b.chrId )
|
|
52 return a.chrId < b.chrId ;
|
|
53 else if ( a.pos != b.pos )
|
|
54 return a.pos < b.pos ;
|
|
55 return false ;
|
|
56 }
|
|
57
|
|
58 void CoalesceIntrons( std::vector<struct _intron> &introns )
|
|
59 {
|
|
60 std::sort( introns.begin(), introns.end(), CompIntrons ) ;
|
|
61 int size = introns.size() ;
|
|
62 int i, k ;
|
|
63 k = 0 ;
|
|
64 for ( i = 1 ; i < size ; ++i )
|
|
65 {
|
|
66 if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start
|
|
67 && introns[i].end == introns[k].end )
|
|
68 {
|
|
69 introns[k].support += introns[i].support ;
|
|
70 introns[k].uniqSupport += introns[i].uniqSupport ;
|
|
71 introns[k].secSupport += introns[i].secSupport ;
|
|
72 introns[k].sampleSupport += introns[i].sampleSupport ;
|
|
73
|
|
74 if ( introns[k].strand == '?' )
|
|
75 introns[k].strand = introns[i].strand ;
|
|
76 }
|
|
77 else
|
|
78 {
|
|
79 ++k ;
|
|
80 introns[k] = introns[i] ;
|
|
81 }
|
|
82 }
|
|
83 introns.resize( k + 1 ) ;
|
|
84 }
|
|
85
|
|
86 void CoalesceSites( std::vector<struct _site> &sites )
|
|
87 {
|
|
88 std::sort( sites.begin(), sites.end(), CompSites ) ;
|
|
89 int size = sites.size() ;
|
|
90 int i, k ;
|
|
91 k = 0 ;
|
|
92 for ( i = 1 ; i < size ; ++i )
|
|
93 {
|
|
94 if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos )
|
|
95 {
|
|
96 sites[k].support += sites[i].support ;
|
|
97 sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ;
|
|
98 }
|
|
99 else
|
|
100 {
|
|
101 ++k ;
|
|
102 sites[k] = sites[i] ;
|
|
103 }
|
|
104 }
|
|
105 sites.resize( k + 1 ) ;
|
|
106 }
|
|
107
|
|
108 int main( int argc, char *argv[] )
|
|
109 {
|
|
110 int i, j, k ;
|
|
111 FILE *fpList ;
|
|
112 FILE *fp ;
|
|
113 char spliceFile[2048] ;
|
|
114 char chrName[1024], strand[3] ;
|
|
115 int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ;
|
|
116 double support ;
|
|
117 Alignments alignments ;
|
|
118 std::vector<struct _intron> introns ;
|
|
119 std::vector<struct _site> sites ;
|
|
120 double averageSupportThreshold = 0.5 ;
|
|
121
|
|
122 if ( argc <= 1 )
|
|
123 {
|
|
124 printf( "%s", usage ) ;
|
|
125 exit( 1 ) ;
|
|
126 }
|
|
127
|
|
128 for ( i = 3 ; i < argc ; ++i )
|
|
129 {
|
|
130 if ( !strcmp( argv[ i ], "-a" ) )
|
|
131 {
|
|
132 averageSupportThreshold = atof( argv[i + 1] ) ;
|
|
133 ++i ;
|
|
134 }
|
|
135 else
|
|
136 {
|
|
137 printf( "Unknown option: %s", argv[i] ) ;
|
|
138 exit( 1 ) ;
|
|
139 }
|
|
140 }
|
|
141
|
|
142 alignments.Open( argv[2] ) ;
|
|
143
|
|
144 // Get the number of samples.
|
|
145 fpList = fopen( argv[1], "r" ) ;
|
|
146 int sampleCnt = 0 ;
|
|
147 while ( fscanf( fpList, "%s", spliceFile ) != EOF )
|
|
148 {
|
|
149 ++sampleCnt ;
|
|
150 }
|
|
151 fclose( fpList ) ;
|
|
152
|
|
153 // Get all the introns.
|
|
154 fpList = fopen( argv[1], "r" ) ;
|
|
155 while ( fscanf( fpList, "%s", spliceFile ) != EOF )
|
|
156 {
|
|
157 fp = fopen( spliceFile, "r" ) ;
|
|
158 while ( fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support,
|
|
159 strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF )
|
|
160 {
|
|
161
|
|
162 if ( support <= 0 )
|
|
163 support = 0.1 ;
|
|
164 else if ( support == 1 && sampleCnt > 5 )
|
|
165 support = 0.75 ;
|
|
166
|
|
167 struct _intron ni ;
|
|
168 ni.chrId = alignments.GetChromIdFromName( chrName ) ;
|
|
169 ni.start = start ;
|
|
170 ni.end = end ;
|
|
171 ni.support = support ;
|
|
172 ni.strand = strand[0] ;
|
|
173 ni.uniqSupport = uniqSupport ;
|
|
174 ni.secSupport = secSupport ;
|
|
175 ni.sampleSupport = 1 ;
|
|
176 ni.editDist = uniqEditDistance + secEditDistance ;
|
|
177 introns.push_back( ni ) ;
|
|
178 }
|
|
179 fclose( fp ) ;
|
|
180
|
|
181 CoalesceIntrons( introns ) ;
|
|
182 }
|
|
183 fclose( fpList ) ;
|
|
184
|
|
185 // Obtain the split sites.
|
|
186 int intronCnt = introns.size() ;
|
|
187 for ( i = 0 ; i < intronCnt ; ++i )
|
|
188 {
|
|
189 struct _site ns ;
|
|
190 ns.chrId = introns[i].chrId ;
|
|
191 ns.associatedIntronCnt = 1 ;
|
|
192 ns.support = introns[i].support ;
|
|
193 ns.strand = introns[i].strand ;
|
|
194
|
|
195 ns.pos = introns[i].start ;
|
|
196 sites.push_back( ns ) ;
|
|
197 ns.pos = introns[i].end ;
|
|
198 sites.push_back( ns ) ;
|
|
199 }
|
|
200 CoalesceSites( sites ) ;
|
|
201
|
|
202 // Get the chromsomes with too many split sites.
|
|
203 int siteCnt = sites.size() ;
|
|
204 std::vector<bool> badChrom ;
|
|
205
|
|
206 badChrom.resize( alignments.GetChromCount() ) ;
|
|
207 int size = alignments.GetChromCount() ;
|
|
208 for ( i = 0 ; i < size ; ++i )
|
|
209 badChrom[i] = false ;
|
|
210
|
|
211 for ( i = 0 ; i < siteCnt ; )
|
|
212 {
|
|
213 for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j )
|
|
214 ;
|
|
215 //printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ;
|
|
216 if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) )
|
|
217 badChrom[ sites[i].chrId ] = true ;
|
|
218 i = j ;
|
|
219 }
|
|
220
|
|
221 // Output the valid introns.
|
|
222 k = 0 ;
|
|
223 double unit = sampleCnt / 50 ;
|
|
224 if ( unit < 1 )
|
|
225 unit = 1 ;
|
|
226
|
|
227 int longIntronSize ;
|
|
228 std::vector<int> intronSizes ;
|
|
229 for (i = 0 ; i < intronCnt ; ++i)
|
|
230 intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ;
|
|
231 std::sort( intronSizes.begin(), intronSizes.end() ) ;
|
|
232 longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ;
|
|
233 if ( longIntronSize > 100000 )
|
|
234 longIntronSize = 100000 ;
|
|
235 for ( i = 0 ; i < intronCnt ; ++i )
|
|
236 {
|
|
237 if ( introns[i].support / sampleCnt < averageSupportThreshold )
|
|
238 continue ;
|
|
239
|
|
240 if ( badChrom[ introns[i].chrId ] )
|
|
241 {
|
|
242 if ( introns[i].sampleSupport <= sampleCnt / 2 )
|
|
243 continue ;
|
|
244 }
|
|
245
|
|
246 if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) )
|
|
247 continue ;
|
|
248
|
|
249 //Locate the two split sites.
|
|
250 while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) )
|
|
251 {
|
|
252 ++k ;
|
|
253 }
|
|
254 int a, b ;
|
|
255 a = k ;
|
|
256 for ( b = a + 1 ; b < siteCnt ; ++b )
|
|
257 {
|
|
258 if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end )
|
|
259 break ;
|
|
260 }
|
|
261 double siteSupport = MAX( sites[a].support, sites[b].support ) ;
|
|
262 //if ( introns[i].start == 100233371 && introns[i].end == 100236850 )
|
|
263 // printf( "test: %lf %lf\n", introns[i].support, siteSupport) ;
|
|
264 if ( introns[i].support < siteSupport / 10.0 )
|
|
265 {
|
|
266 double needSample = MIN( ( -log( introns[i].support / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ;
|
|
267 if ( introns[i].sampleSupport < needSample )
|
|
268 continue ;
|
|
269 }
|
|
270
|
|
271 if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 )
|
|
272 {
|
|
273 if ( introns[i].sampleSupport <= sampleCnt * 0.01 )
|
|
274 continue ;
|
|
275 }
|
|
276 /*if ( sampleCnt >= 50 )
|
|
277 {
|
|
278 // just some randomly intron.
|
|
279 if ( introns[i].sampleSupport == 1
|
|
280 && ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) )
|
|
281 continue ;
|
|
282 }*/
|
|
283
|
|
284 /*if ( introns[i].end - introns[i].start + 1 < 50 )
|
|
285 {
|
|
286 int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ;
|
|
287 int flag = 0 ;
|
|
288
|
|
289 if ( introns[i].sampleSupport < needSample )
|
|
290 flag = 1 ;
|
|
291 if ( flag == 1 )
|
|
292 continue ;
|
|
293 }*/
|
|
294 if ( introns[i].strand == '?' && sampleCnt == 1 &&
|
|
295 ( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) )
|
|
296 continue ;
|
|
297
|
|
298 // Since the strand is uncertain, the alinger may make different decision sample from sample.
|
|
299 // To keep this intron, one of its splice sites must be more supported than adjacent splice sites.
|
|
300 if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
|
|
301 {
|
|
302 int s, e ;
|
|
303 int l ;
|
|
304 int cnt = 0 ;
|
|
305 for ( l = 0 ; l < 2 ; ++l )
|
|
306 {
|
|
307 int ind = ( l == 0 ) ? a : b ;
|
|
308 double max = sites[ind].support ;
|
|
309 int maxTag = ind ;
|
|
310 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
|
|
311 {
|
|
312 if ( sites[s].pos + 7 < sites[s + 1].pos )
|
|
313 break ;
|
|
314 if ( sites[s].support >= max )
|
|
315 {
|
|
316 max = sites[s].support ;
|
|
317 maxTag = s ;
|
|
318 }
|
|
319 }
|
|
320
|
|
321 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
|
|
322 {
|
|
323 if ( sites[e].pos - 7 > sites[e - 1].pos )
|
|
324 break ;
|
|
325 if ( sites[e].support >= max )
|
|
326 {
|
|
327 max = sites[e].support ;
|
|
328 maxTag = e ;
|
|
329 }
|
|
330 }
|
|
331
|
|
332 if ( maxTag == ind )
|
|
333 ++cnt ;
|
|
334 }
|
|
335 if ( cnt == 0 )
|
|
336 continue ;
|
|
337 }
|
|
338
|
|
339 // Test whether this a intron coming from a wrong strand
|
|
340 /*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
|
|
341 {
|
|
342 int plusStrand = 0 ;
|
|
343 int minusStrand = 0 ;
|
|
344 int l ;
|
|
345 int s, e ;
|
|
346
|
|
347 if ( introns[i].strand == '+' )
|
|
348 plusStrand = 2 ;
|
|
349 else
|
|
350 minusStrand = 2 ;
|
|
351
|
|
352 for ( l = 0 ; l < 2 ; ++l )
|
|
353 {
|
|
354 int ind = ( l == 0 ) ? a : b ;
|
|
355 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
|
|
356 {
|
|
357 if ( sites[s].pos + 10000 < sites[s + 1].pos )
|
|
358 break ;
|
|
359
|
|
360 if ( sites[s].strand == '+' )
|
|
361 ++plusStrand ;
|
|
362 else if ( sites[s].strand == '-' )
|
|
363 ++minusStrand ;
|
|
364 }
|
|
365
|
|
366 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
|
|
367 {
|
|
368 if ( sites[e].pos - 10000 > sites[e - 1].pos )
|
|
369 break ;
|
|
370
|
|
371 if ( sites[e].strand == '+' )
|
|
372 ++plusStrand ;
|
|
373 else if ( sites[e].strand == '-' )
|
|
374 ++minusStrand ;
|
|
375 }
|
|
376 }
|
|
377
|
|
378 if ( introns[i].start == 161517978 )
|
|
379 printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ;
|
|
380
|
|
381 if ( introns[i].strand == '+' && minusStrand >= 20 && plusStrand == 2 )
|
|
382 continue ;
|
|
383 else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 )
|
|
384 continue ;
|
|
385
|
|
386 }*/
|
|
387
|
|
388 // Filter a intron if one of its splice site associated with too many introns.
|
|
389 /*if ( sites[a].associatedIntronCnt >= 10 )
|
|
390 {
|
|
391 int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
|
|
392 if ( introns[i].support < sites[a].support / 100 &&
|
|
393 introns[i].sampleSupport < needSample )
|
|
394 {
|
|
395 continue ;
|
|
396 }
|
|
397 }
|
|
398 if ( sites[b].associatedIntronCnt >= 10 )
|
|
399 {
|
|
400 int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
|
|
401 if ( introns[i].support < sites[b].support / 100 &&
|
|
402 introns[i].sampleSupport < needSample )
|
|
403 {
|
|
404 continue ;
|
|
405 }
|
|
406 }*/
|
|
407
|
|
408
|
|
409 // Test for long intron
|
|
410 if ( introns[i].end - introns[i].start + 1 >= longIntronSize )
|
|
411 {
|
|
412 int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ;
|
|
413 int flag = 0 ;
|
|
414 if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1
|
|
415 || introns[i].uniqSupport / sampleCnt < 1
|
|
416 || introns[i].sampleSupport < needSample )
|
|
417 flag = 1 ;
|
|
418 if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize )
|
|
419 continue ;
|
|
420 if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes
|
|
421 continue ;
|
|
422 }
|
|
423
|
|
424
|
|
425 printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ;
|
|
426 }
|
|
427
|
|
428 return 0 ;
|
|
429 }
|