comparison PsiCLASS-1.0.2/GetTrustedSplice.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #include <stdio.h>
2 #include <math.h>
3 #include <vector>
4 #include <algorithm>
5
6 #include "alignments.hpp"
7
8 #define MAX(x, y) (((x)<(y))?(y):(x))
9 #define MIN(x, y) (((x)<(y))?(x):(y))
10
11 char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n"
12 "Options:\n"
13 "\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ;
14
15 struct _intron
16 {
17 int chrId ;
18 int start, end ;
19 double support ;
20 int uniqSupport ;
21 int secSupport ;
22 char strand ;
23 int editDist ;
24
25 int sampleSupport ;
26 } ;
27
28 struct _site
29 {
30 int chrId ;
31 int pos ;
32 double support ;
33
34 char strand ;
35 int associatedIntronCnt ;
36 } ;
37
38 bool CompIntrons( struct _intron a, struct _intron b )
39 {
40 if ( a.chrId != b.chrId )
41 return a.chrId < b.chrId ;
42 else if ( a.start != b.start )
43 return a.start < b.start ;
44 else if ( a.end != b.end )
45 return a.end < b.end ;
46 return false ;
47 }
48
49 bool CompSites( struct _site a, struct _site b )
50 {
51 if ( a.chrId != b.chrId )
52 return a.chrId < b.chrId ;
53 else if ( a.pos != b.pos )
54 return a.pos < b.pos ;
55 return false ;
56 }
57
58 void CoalesceIntrons( std::vector<struct _intron> &introns )
59 {
60 std::sort( introns.begin(), introns.end(), CompIntrons ) ;
61 int size = introns.size() ;
62 int i, k ;
63 k = 0 ;
64 for ( i = 1 ; i < size ; ++i )
65 {
66 if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start
67 && introns[i].end == introns[k].end )
68 {
69 introns[k].support += introns[i].support ;
70 introns[k].uniqSupport += introns[i].uniqSupport ;
71 introns[k].secSupport += introns[i].secSupport ;
72 introns[k].sampleSupport += introns[i].sampleSupport ;
73
74 if ( introns[k].strand == '?' )
75 introns[k].strand = introns[i].strand ;
76 }
77 else
78 {
79 ++k ;
80 introns[k] = introns[i] ;
81 }
82 }
83 introns.resize( k + 1 ) ;
84 }
85
86 void CoalesceSites( std::vector<struct _site> &sites )
87 {
88 std::sort( sites.begin(), sites.end(), CompSites ) ;
89 int size = sites.size() ;
90 int i, k ;
91 k = 0 ;
92 for ( i = 1 ; i < size ; ++i )
93 {
94 if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos )
95 {
96 sites[k].support += sites[i].support ;
97 sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ;
98 }
99 else
100 {
101 ++k ;
102 sites[k] = sites[i] ;
103 }
104 }
105 sites.resize( k + 1 ) ;
106 }
107
108 int main( int argc, char *argv[] )
109 {
110 int i, j, k ;
111 FILE *fpList ;
112 FILE *fp ;
113 char spliceFile[2048] ;
114 char chrName[1024], strand[3] ;
115 int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ;
116 double support ;
117 Alignments alignments ;
118 std::vector<struct _intron> introns ;
119 std::vector<struct _site> sites ;
120 double averageSupportThreshold = 0.5 ;
121
122 if ( argc <= 1 )
123 {
124 printf( "%s", usage ) ;
125 exit( 1 ) ;
126 }
127
128 for ( i = 3 ; i < argc ; ++i )
129 {
130 if ( !strcmp( argv[ i ], "-a" ) )
131 {
132 averageSupportThreshold = atof( argv[i + 1] ) ;
133 ++i ;
134 }
135 else
136 {
137 printf( "Unknown option: %s", argv[i] ) ;
138 exit( 1 ) ;
139 }
140 }
141
142 alignments.Open( argv[2] ) ;
143
144 // Get the number of samples.
145 fpList = fopen( argv[1], "r" ) ;
146 int sampleCnt = 0 ;
147 while ( fscanf( fpList, "%s", spliceFile ) != EOF )
148 {
149 ++sampleCnt ;
150 }
151 fclose( fpList ) ;
152
153 // Get all the introns.
154 fpList = fopen( argv[1], "r" ) ;
155 while ( fscanf( fpList, "%s", spliceFile ) != EOF )
156 {
157 fp = fopen( spliceFile, "r" ) ;
158 while ( fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support,
159 strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF )
160 {
161
162 if ( support <= 0 )
163 support = 0.1 ;
164 else if ( support == 1 && sampleCnt > 5 )
165 support = 0.75 ;
166
167 struct _intron ni ;
168 ni.chrId = alignments.GetChromIdFromName( chrName ) ;
169 ni.start = start ;
170 ni.end = end ;
171 ni.support = support ;
172 ni.strand = strand[0] ;
173 ni.uniqSupport = uniqSupport ;
174 ni.secSupport = secSupport ;
175 ni.sampleSupport = 1 ;
176 ni.editDist = uniqEditDistance + secEditDistance ;
177 introns.push_back( ni ) ;
178 }
179 fclose( fp ) ;
180
181 CoalesceIntrons( introns ) ;
182 }
183 fclose( fpList ) ;
184
185 // Obtain the split sites.
186 int intronCnt = introns.size() ;
187 for ( i = 0 ; i < intronCnt ; ++i )
188 {
189 struct _site ns ;
190 ns.chrId = introns[i].chrId ;
191 ns.associatedIntronCnt = 1 ;
192 ns.support = introns[i].support ;
193 ns.strand = introns[i].strand ;
194
195 ns.pos = introns[i].start ;
196 sites.push_back( ns ) ;
197 ns.pos = introns[i].end ;
198 sites.push_back( ns ) ;
199 }
200 CoalesceSites( sites ) ;
201
202 // Get the chromsomes with too many split sites.
203 int siteCnt = sites.size() ;
204 std::vector<bool> badChrom ;
205
206 badChrom.resize( alignments.GetChromCount() ) ;
207 int size = alignments.GetChromCount() ;
208 for ( i = 0 ; i < size ; ++i )
209 badChrom[i] = false ;
210
211 for ( i = 0 ; i < siteCnt ; )
212 {
213 for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j )
214 ;
215 //printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ;
216 if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) )
217 badChrom[ sites[i].chrId ] = true ;
218 i = j ;
219 }
220
221 // Output the valid introns.
222 k = 0 ;
223 double unit = sampleCnt / 50 ;
224 if ( unit < 1 )
225 unit = 1 ;
226
227 int longIntronSize ;
228 std::vector<int> intronSizes ;
229 for (i = 0 ; i < intronCnt ; ++i)
230 intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ;
231 std::sort( intronSizes.begin(), intronSizes.end() ) ;
232 longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ;
233 if ( longIntronSize > 100000 )
234 longIntronSize = 100000 ;
235 for ( i = 0 ; i < intronCnt ; ++i )
236 {
237 if ( introns[i].support / sampleCnt < averageSupportThreshold )
238 continue ;
239
240 if ( badChrom[ introns[i].chrId ] )
241 {
242 if ( introns[i].sampleSupport <= sampleCnt / 2 )
243 continue ;
244 }
245
246 if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) )
247 continue ;
248
249 //Locate the two split sites.
250 while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) )
251 {
252 ++k ;
253 }
254 int a, b ;
255 a = k ;
256 for ( b = a + 1 ; b < siteCnt ; ++b )
257 {
258 if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end )
259 break ;
260 }
261 double siteSupport = MAX( sites[a].support, sites[b].support ) ;
262 //if ( introns[i].start == 100233371 && introns[i].end == 100236850 )
263 // printf( "test: %lf %lf\n", introns[i].support, siteSupport) ;
264 if ( introns[i].support < siteSupport / 10.0 )
265 {
266 double needSample = MIN( ( -log( introns[i].support / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ;
267 if ( introns[i].sampleSupport < needSample )
268 continue ;
269 }
270
271 if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 )
272 {
273 if ( introns[i].sampleSupport <= sampleCnt * 0.01 )
274 continue ;
275 }
276 /*if ( sampleCnt >= 50 )
277 {
278 // just some randomly intron.
279 if ( introns[i].sampleSupport == 1
280 && ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) )
281 continue ;
282 }*/
283
284 /*if ( introns[i].end - introns[i].start + 1 < 50 )
285 {
286 int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ;
287 int flag = 0 ;
288
289 if ( introns[i].sampleSupport < needSample )
290 flag = 1 ;
291 if ( flag == 1 )
292 continue ;
293 }*/
294 if ( introns[i].strand == '?' && sampleCnt == 1 &&
295 ( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) )
296 continue ;
297
298 // Since the strand is uncertain, the alinger may make different decision sample from sample.
299 // To keep this intron, one of its splice sites must be more supported than adjacent splice sites.
300 if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
301 {
302 int s, e ;
303 int l ;
304 int cnt = 0 ;
305 for ( l = 0 ; l < 2 ; ++l )
306 {
307 int ind = ( l == 0 ) ? a : b ;
308 double max = sites[ind].support ;
309 int maxTag = ind ;
310 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
311 {
312 if ( sites[s].pos + 7 < sites[s + 1].pos )
313 break ;
314 if ( sites[s].support >= max )
315 {
316 max = sites[s].support ;
317 maxTag = s ;
318 }
319 }
320
321 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
322 {
323 if ( sites[e].pos - 7 > sites[e - 1].pos )
324 break ;
325 if ( sites[e].support >= max )
326 {
327 max = sites[e].support ;
328 maxTag = e ;
329 }
330 }
331
332 if ( maxTag == ind )
333 ++cnt ;
334 }
335 if ( cnt == 0 )
336 continue ;
337 }
338
339 // Test whether this a intron coming from a wrong strand
340 /*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
341 {
342 int plusStrand = 0 ;
343 int minusStrand = 0 ;
344 int l ;
345 int s, e ;
346
347 if ( introns[i].strand == '+' )
348 plusStrand = 2 ;
349 else
350 minusStrand = 2 ;
351
352 for ( l = 0 ; l < 2 ; ++l )
353 {
354 int ind = ( l == 0 ) ? a : b ;
355 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
356 {
357 if ( sites[s].pos + 10000 < sites[s + 1].pos )
358 break ;
359
360 if ( sites[s].strand == '+' )
361 ++plusStrand ;
362 else if ( sites[s].strand == '-' )
363 ++minusStrand ;
364 }
365
366 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
367 {
368 if ( sites[e].pos - 10000 > sites[e - 1].pos )
369 break ;
370
371 if ( sites[e].strand == '+' )
372 ++plusStrand ;
373 else if ( sites[e].strand == '-' )
374 ++minusStrand ;
375 }
376 }
377
378 if ( introns[i].start == 161517978 )
379 printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ;
380
381 if ( introns[i].strand == '+' && minusStrand >= 20 && plusStrand == 2 )
382 continue ;
383 else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 )
384 continue ;
385
386 }*/
387
388 // Filter a intron if one of its splice site associated with too many introns.
389 /*if ( sites[a].associatedIntronCnt >= 10 )
390 {
391 int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
392 if ( introns[i].support < sites[a].support / 100 &&
393 introns[i].sampleSupport < needSample )
394 {
395 continue ;
396 }
397 }
398 if ( sites[b].associatedIntronCnt >= 10 )
399 {
400 int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
401 if ( introns[i].support < sites[b].support / 100 &&
402 introns[i].sampleSupport < needSample )
403 {
404 continue ;
405 }
406 }*/
407
408
409 // Test for long intron
410 if ( introns[i].end - introns[i].start + 1 >= longIntronSize )
411 {
412 int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ;
413 int flag = 0 ;
414 if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1
415 || introns[i].uniqSupport / sampleCnt < 1
416 || introns[i].sampleSupport < needSample )
417 flag = 1 ;
418 if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize )
419 continue ;
420 if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes
421 continue ;
422 }
423
424
425 printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ;
426 }
427
428 return 0 ;
429 }