0
|
1 #include <stdio.h>
|
|
2 #include <stdint.h>
|
|
3
|
|
4 #include <string.h>
|
|
5 #include <stdlib.h>
|
|
6 #include <vector>
|
|
7 #include <map>
|
|
8 #include <fstream>
|
|
9
|
|
10 char usage[] =
|
|
11 "./addXS ref.fa [OPTIONS] < in.sam > out.sam\n"
|
|
12 "From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ;
|
|
13 char samLine[65537] ;
|
|
14 char seq[65537] ;
|
|
15
|
|
16 char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
|
|
17 -1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A'
|
|
18 -1, -1, -1, -1, -1, 3,
|
|
19 -1, -1, -1, -1, -1, -1 } ;
|
|
20 char numToNuc[26] = {'A', 'C', 'G', 'T'} ;
|
|
21
|
|
22 struct _pair
|
|
23 {
|
|
24 int a, b ;
|
|
25 } ;
|
|
26
|
|
27 class BitSequence
|
|
28 {
|
|
29 private:
|
|
30 int len ;
|
|
31 //std::vector<uint32_t> sequence ;
|
|
32 uint32_t *sequence ;
|
|
33
|
|
34 public:
|
|
35 BitSequence() { len = 0 ; sequence = NULL ;}
|
|
36 BitSequence( int l )
|
|
37 {
|
|
38 len = 0 ;
|
|
39 sequence = new uint32_t[ l / 16 + 1 ] ;
|
|
40 }
|
|
41
|
|
42 ~BitSequence()
|
|
43 {
|
|
44 //if ( sequence != NULL )
|
|
45 // delete[] sequence ;
|
|
46 }
|
|
47
|
|
48 void SetLength( int l )
|
|
49 {
|
|
50 len = 0 ;
|
|
51 if ( sequence != NULL )
|
|
52 delete[] sequence ;
|
|
53 sequence = new uint32_t[ l / 16 + 1 ] ;
|
|
54 }
|
|
55
|
|
56 int GetLength()
|
|
57 {
|
|
58 return len ;
|
|
59 }
|
|
60
|
|
61 void Append( char c )
|
|
62 {
|
|
63 if ( ( len & 15 ) == 0 )
|
|
64 {
|
|
65 sequence[ len / 16 ] = 0 ;
|
|
66 }
|
|
67 ++len ;
|
|
68 //printf( "%d %c\n", len, c ) ;
|
|
69 Set( c, len - 1 ) ;
|
|
70 }
|
|
71
|
|
72 // pos is 0-based coordinate
|
|
73 // notice that the order within one 32 bit butcket is reversed
|
|
74 void Set( char c, int pos )
|
|
75 {
|
|
76 if ( pos >= len )
|
|
77 return ;
|
|
78
|
|
79 if ( c >= 'a' && c <= 'z' )
|
|
80 {
|
|
81 c = c - 'a' + 'A' ;
|
|
82 }
|
|
83 if ( c == 'N' )
|
|
84 c = 'A' ;
|
|
85
|
|
86 int ind = pos >> 4 ;
|
|
87 int offset = pos & 15 ;
|
|
88 int mask = ( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ;
|
|
89 sequence[ind] = sequence[ind] | mask ;
|
|
90 //if ( c != 'A' )
|
|
91 // printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ;
|
|
92 //Print() ;
|
|
93 }
|
|
94
|
|
95 char Get( int pos )
|
|
96 {
|
|
97 if ( pos >= len )
|
|
98 return 'N' ;
|
|
99
|
|
100 int ind = pos >> 4 ;
|
|
101 int offset = pos & 15 ;
|
|
102 //printf( "%d: %d\n", pos, sequence[ind] ) ;
|
|
103 return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ;
|
|
104 }
|
|
105
|
|
106 void Print()
|
|
107 {
|
|
108 int i ;
|
|
109 for ( i = 0 ; i < len ; ++i )
|
|
110 printf( "%c", Get( i ) ) ;
|
|
111 printf( "\n" ) ;
|
|
112 }
|
|
113
|
|
114 void Print( FILE *fp, int start, int end, bool rc )
|
|
115 {
|
|
116 if ( !rc )
|
|
117 {
|
|
118 for ( int i = start ; i <= end ; ++i )
|
|
119 fprintf( fp, "%c", Get( i ) ) ;
|
|
120 }
|
|
121 else
|
|
122 {
|
|
123 for ( int i = end ; i >= start ; --i )
|
|
124 {
|
|
125 char c = Get( i ) ;
|
|
126 if ( c == 'A' )
|
|
127 c = 'T' ;
|
|
128 else if ( c == 'C' )
|
|
129 c = 'G' ;
|
|
130 else if ( c == 'G' )
|
|
131 c = 'C' ;
|
|
132 else //if ( c == 'T' )
|
|
133 c = 'A' ;
|
|
134 fprintf( fp, "%c", c ) ;
|
|
135 }
|
|
136 }
|
|
137 }
|
|
138 } ;
|
|
139
|
|
140 void ReverseComplement( char *s, int l )
|
|
141 {
|
|
142 int u, v ;
|
|
143 for ( u = 0, v = l - 1 ; u <= v ; ++u, --v )
|
|
144 {
|
|
145 char tmp ;
|
|
146 tmp = s[v] ;
|
|
147 s[v] = s[u] ;
|
|
148 s[u] = tmp ;
|
|
149
|
|
150 s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ;
|
|
151 s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ;
|
|
152 }
|
|
153 }
|
|
154
|
|
155
|
|
156 int main( int argc, char *argv[] )
|
|
157 {
|
|
158 int i, j, k ;
|
|
159
|
|
160 std::vector<BitSequence> genome ;
|
|
161 std::map<std::string, int> chrToChrId ;
|
|
162
|
|
163 char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ;
|
|
164 char buffer[1024] ;
|
|
165 char pattern[1024] ;
|
|
166 int start, mstart, flag, tlen ; // read start and mate read start
|
|
167 int seqLen ;
|
|
168 int chrCnt = 0 ;
|
|
169 int chrId ;
|
|
170 int width = 7 ;
|
|
171 int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error
|
|
172
|
|
173 if ( argc < 2 )
|
|
174 {
|
|
175 printf( "%s", usage ) ;
|
|
176 return 0 ;
|
|
177 }
|
|
178
|
|
179
|
|
180 std::ifstream fpRef ;
|
|
181 fpRef.open( argv[1] ) ;
|
|
182 std::string line ;
|
|
183
|
|
184 int motifStrand[1024] ;
|
|
185
|
|
186 /*motifStrand[ 0b10110010 ] = 1 ; // GT/AG
|
|
187 motifStrand[ 0b01110001 ] = 0 ; // CT/AC
|
|
188 motifStrand[ 0b10010010 ] = 1 ;// GC/AG
|
|
189 motifStrand[ 0b01111001 ] = 0 ; // CT/GC
|
|
190 motifStrand[ 0b00110001 ] = 1 ; // AT/AC
|
|
191 motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/
|
|
192 memset( motifStrand, -1, sizeof( motifStrand )) ;
|
|
193 motifStrand[ 0xb2 ] = 1 ; // GT/AG
|
|
194 motifStrand[ 0x71 ] = 0 ; // CT/AC
|
|
195 motifStrand[ 0x92 ] = 1 ;// GC/AG
|
|
196 motifStrand[ 0x79 ] = 0 ; // CT/GC
|
|
197 motifStrand[ 0x31 ] = 1 ; // AT/AC
|
|
198 motifStrand[ 0xb3 ] = 0 ; // GT/AT
|
|
199
|
|
200 k = 0 ;
|
|
201 while ( getline( fpRef, line ) )
|
|
202 {
|
|
203 //printf( "%s\n", line.c_str() ) ;
|
|
204 int len = line.length() ;
|
|
205 if ( line[0] == '>' )
|
|
206 {
|
|
207 //char *s = strdup( line.c_str() + 1 ) ;
|
|
208 if ( chrCnt > 0 )
|
|
209 {
|
|
210 genome[ chrCnt - 1 ].SetLength( k ) ;
|
|
211 }
|
|
212 for ( i = 1 ; i < len ; ++i )
|
|
213 if ( line[i] == ' ' || line[i] == '\t' )
|
|
214 break ;
|
|
215 chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ;
|
|
216 ++chrCnt ;
|
|
217
|
|
218 BitSequence bs ;
|
|
219 genome.push_back( bs ) ;
|
|
220
|
|
221 k = 0 ;
|
|
222 }
|
|
223 else
|
|
224 {
|
|
225 k += len ;
|
|
226 }
|
|
227 }
|
|
228 genome[ chrCnt - 1 ].SetLength( k ) ;
|
|
229 fpRef.close() ;
|
|
230
|
|
231 fpRef.open( argv[1] ) ;
|
|
232 while ( getline( fpRef, line ) )
|
|
233 {
|
|
234 //printf( "%s\n", line.c_str() ) ;
|
|
235 int len = line.length() ;
|
|
236 if ( line[0] == '>' )
|
|
237 {
|
|
238 //char *s = strdup( line.c_str() + 1 ) ;
|
|
239 for ( i = 1 ; i < len ; ++i )
|
|
240 if ( line[i] == ' ' || line[i] == '\t' )
|
|
241 break ;
|
|
242 chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ;
|
|
243 }
|
|
244 else
|
|
245 {
|
|
246 BitSequence &bs = genome[chrId] ;
|
|
247 for ( i = 0 ; i < len ; ++i )
|
|
248 {
|
|
249 if ( ( line[i] >= 'A' && line[i] <= 'Z' ) ||
|
|
250 ( line[i] >= 'a' && line[i] <= 'z' ) )
|
|
251 bs.Append( line[i] ) ;
|
|
252 }
|
|
253 }
|
|
254 }
|
|
255 fpRef.close() ;
|
|
256
|
|
257
|
|
258
|
|
259 while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL )
|
|
260 {
|
|
261 if ( samLine[0] == '@' )
|
|
262 {
|
|
263 printf( "%s", samLine ) ;
|
|
264 continue ;
|
|
265 }
|
|
266
|
|
267 sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ;
|
|
268 struct _pair segments[1000] ;
|
|
269 struct _pair seqSegments[1000] ; // the segments with respect to the sequence.
|
|
270 int segCnt = 0 ;
|
|
271 int seqSegCnt = 0 ;
|
|
272 int num = 0 ;
|
|
273 int len = 0 ;
|
|
274
|
|
275 score = 0 ;
|
|
276
|
|
277 for ( i = 0 ; cigar[i] ; ++i )
|
|
278 {
|
|
279 if ( cigar[i] >= '0' && cigar[i] <= '9' )
|
|
280 num = num * 10 + cigar[i] - '0' ;
|
|
281 else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H'
|
|
282 || cigar[i] == 'P' )
|
|
283 {
|
|
284 num = 0 ;
|
|
285 }
|
|
286 else if ( cigar[i] == 'N' )
|
|
287 {
|
|
288 segments[segCnt].a = start ;
|
|
289 segments[segCnt].b = start + len - 1 ;
|
|
290 ++segCnt ;
|
|
291
|
|
292 start = start + len + num ;
|
|
293 len = 0 ;
|
|
294 num = 0 ;
|
|
295 }
|
|
296 else
|
|
297 {
|
|
298 len += num ;
|
|
299 num = 0 ;
|
|
300 }
|
|
301 }
|
|
302
|
|
303
|
|
304 if ( len > 0 )
|
|
305 {
|
|
306 segments[segCnt].a = start ;
|
|
307 segments[segCnt].b = start + len - 1 ;
|
|
308 ++segCnt ;
|
|
309 }
|
|
310
|
|
311 if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL )
|
|
312 {
|
|
313 printf( "%s", samLine ) ;
|
|
314 continue ;
|
|
315 }
|
|
316
|
|
317 std::string schr( chrom ) ;
|
|
318 int chrId = chrToChrId[schr] ;
|
|
319 int strand = -1 ;
|
|
320
|
|
321 len = strlen( samLine ) ;
|
|
322 samLine[len - 1] = '\0' ;
|
|
323 int motif = 0 ;
|
|
324
|
|
325 BitSequence &chrSeq = genome[ chrId ] ;
|
|
326
|
|
327 for ( i = 0 ; i < segCnt - 1 ; ++i )
|
|
328 {
|
|
329 char m[4] ;
|
|
330 m[0] = chrSeq.Get( segments[i].b + 1 - 1 ) ;
|
|
331 m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ;
|
|
332 m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ;
|
|
333 m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ;
|
|
334 motif = 0 ;
|
|
335 for ( j = 0 ; j < 4 ; ++j )
|
|
336 motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 );
|
|
337 strand = motifStrand[ motif ] ;
|
|
338 if ( strand != -1 )
|
|
339 break ;
|
|
340 }
|
|
341 if ( strand == 1 )
|
|
342 {
|
|
343 printf( "%s\tXS:A:+\n", samLine ) ;
|
|
344 continue ;
|
|
345 }
|
|
346 else if ( strand == 0 )
|
|
347 {
|
|
348 printf( "%s\tXS:A:-\n", samLine ) ;
|
|
349 continue ;
|
|
350 }
|
|
351
|
|
352 char *p ;
|
|
353 if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL )
|
|
354 {
|
|
355 p += 5 ;
|
|
356 num = 0 ;
|
|
357 while ( *p >= '0' && *p <= '9' )
|
|
358 {
|
|
359 num = num * 10 + *p - '0' ;
|
|
360 ++p ;
|
|
361 }
|
|
362 if ( num > 1 )
|
|
363 score |= 1 ;
|
|
364 }
|
|
365
|
|
366 seqLen = strlen( seq ) ;
|
|
367 for ( i = 0 ; i < seqLen ; ++i )
|
|
368 if ( seq[i] == 'N' )
|
|
369 {
|
|
370 score |= 1 ;
|
|
371 break ;
|
|
372 }
|
|
373
|
|
374 num = 0 ;
|
|
375 len = 0 ;
|
|
376 seqSegCnt = 0 ;
|
|
377 start = 0 ;
|
|
378 for ( i = 0 ; cigar[i] ; ++i )
|
|
379 {
|
|
380 if ( cigar[i] >= '0' && cigar[i] <= '9' )
|
|
381 num = num * 10 + cigar[i] - '0' ;
|
|
382 else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H'
|
|
383 || cigar[i] == 'P' )
|
|
384 {
|
|
385 num = 0 ;
|
|
386 }
|
|
387 else if ( cigar[i] == 'N' )
|
|
388 {
|
|
389 seqSegments[seqSegCnt].a = start ;
|
|
390 seqSegments[seqSegCnt].b = start + len - 1 ;
|
|
391 ++seqSegCnt ;
|
|
392
|
|
393 start = start + len ;
|
|
394 len = 0 ;
|
|
395 num = 0 ;
|
|
396 }
|
|
397 else
|
|
398 {
|
|
399 len += num ;
|
|
400 num = 0 ;
|
|
401 }
|
|
402 }
|
|
403 if ( len > 0 )
|
|
404 {
|
|
405 seqSegments[seqSegCnt].a = start ;
|
|
406 seqSegments[seqSegCnt].b = start + len - 1 ;
|
|
407 ++seqSegCnt ;
|
|
408 }
|
|
409
|
|
410 // Check whether we can shift the intron to a canonical splice site
|
|
411 for ( i = 0 ; i < segCnt - 1 ; ++i )
|
|
412 {
|
|
413 for ( j = -width ; j < width ; ++j )
|
|
414 {
|
|
415 if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b )
|
|
416 continue ;
|
|
417 // Look for the splice signal.
|
|
418 char m[4] ;
|
|
419 m[0] = chrSeq.Get( segments[i].b + j + 1 - 1 ) ;
|
|
420 m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ;
|
|
421 m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ;
|
|
422 m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ;
|
|
423 motif = 0 ;
|
|
424 for ( k = 0 ; k < 4 ; ++k )
|
|
425 motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 );
|
|
426 strand = motifStrand[ motif ] ;
|
|
427 if ( strand == -1 )
|
|
428 continue ;
|
|
429 //printf( "%d %d: %d\n", i, j, motif ) ;
|
|
430
|
|
431 // We found a signal, then test whether we can shift the intron.
|
|
432 // Extract the sequence from the reference genome.
|
|
433 int l = 0 ;
|
|
434 if ( j < 0 )
|
|
435 for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l )
|
|
436 buffer[l] = chrSeq.Get( k - 1 ) ;
|
|
437 else
|
|
438 for ( k = segments[i].b + 1, l= 0 ; k <= segments[i].b + j ; ++k, ++l )
|
|
439 buffer[l] = chrSeq.Get(k - 1) ;
|
|
440 buffer[l] = '\0' ;
|
|
441
|
|
442 //printf( "%s\n", buffer ) ;
|
|
443
|
|
444 // Get the coordinate on the sequence.
|
|
445 int tag ;
|
|
446 int useBegin = 0 ; // is the portion from the beginning part of a seqSegment?
|
|
447 int from, to ;
|
|
448 if ( j < 0 )
|
|
449 {
|
|
450 tag = i ;
|
|
451 useBegin = 0 ;
|
|
452 }
|
|
453 else // j>0
|
|
454 {
|
|
455 tag = i + 1 ;
|
|
456 useBegin = 1 ;
|
|
457 }
|
|
458 //printf( "%d %d\n", tag, useBegin ) ;
|
|
459
|
|
460 if ( ( flag & 0x10 ) != 0 )
|
|
461 {
|
|
462 //TODO: it seems star already fliped the sequence.
|
|
463 // is it true for other aligners?
|
|
464
|
|
465 //tag = segCnt - 1 - tag ;
|
|
466 //useBegin = 1 - useBegin ;
|
|
467
|
|
468
|
|
469 //ReverseComplement( buffer, l ) ;
|
|
470 }
|
|
471
|
|
472 if ( useBegin )
|
|
473 {
|
|
474 from = seqSegments[tag].a ;
|
|
475 to = seqSegments[tag].a + l - 1 ;
|
|
476 }
|
|
477 else
|
|
478 {
|
|
479 from = seqSegments[tag].b - l + 1 ;
|
|
480 to = seqSegments[tag].b ;
|
|
481 }
|
|
482 //printf( "%d %d %d\n", tag, from, to ) ;
|
|
483
|
|
484 for ( k = 0 ; k < l ; ++k )
|
|
485 {
|
|
486 if ( seq[from + k] != buffer[k] )
|
|
487 break ;
|
|
488 }
|
|
489
|
|
490
|
|
491 if ( k >= l )
|
|
492 {
|
|
493 // We can shift the intron
|
|
494 score |= 2 ;
|
|
495 break ;
|
|
496 }
|
|
497 }
|
|
498 if ( j < width )
|
|
499 break ;
|
|
500 }
|
|
501
|
|
502 // Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence.
|
|
503 for ( i = 0 ; i < seqSegCnt ; ++i )
|
|
504 {
|
|
505 int l ;
|
|
506 for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k )
|
|
507 buffer[k] = chrSeq.Get( j ) ;
|
|
508 buffer[k] = '0' ;
|
|
509
|
|
510 for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l )
|
|
511 {
|
|
512 pattern[l] = seq[j] ;
|
|
513 }
|
|
514 pattern[l] = '\0' ;
|
|
515
|
|
516 // It seems the aligner already flipped the read in its output?
|
|
517 //if ( flag & 0x10 != 0 )
|
|
518 // ReverseComplement( pattern, l ) ;
|
|
519
|
|
520
|
|
521 char *p = buffer ;
|
|
522 int cnt = 0 ;
|
|
523 while ( ( p = strstr( p, pattern ) ) != NULL )
|
|
524 {
|
|
525 ++cnt ;
|
|
526 ++p ;
|
|
527 }
|
|
528 if ( cnt > 1 )
|
|
529 {
|
|
530 score |= 4 ;
|
|
531 break ;
|
|
532 }
|
|
533
|
|
534 int count[5] = { 0, 0, 0, 0, 0 } ;
|
|
535 int max = 0 ;
|
|
536 for ( j = 0 ; j < l ; ++j )
|
|
537 {
|
|
538 ++count[ nucToNum[ pattern[j] - 'A' ] ] ;
|
|
539 }
|
|
540
|
|
541 for ( j = 0 ; j < 5 ; ++j )
|
|
542 {
|
|
543 if ( count[j] > max )
|
|
544 max = count[j] ;
|
|
545 }
|
|
546 if ( max > 0.8 * l )
|
|
547 {
|
|
548 score |= 4 ;
|
|
549 break ;
|
|
550 }
|
|
551 }
|
|
552
|
|
553 if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL )
|
|
554 {
|
|
555 int nm = atoi( p + 5 ) ;
|
|
556 if ( nm > 0 )
|
|
557 {
|
|
558 score |= 8 ;
|
|
559 }
|
|
560 }
|
|
561 else
|
|
562 {
|
|
563 // TODO: check the nm by ourself
|
|
564 }
|
|
565
|
|
566 // Check whether the alignment is concordant.
|
|
567 if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 )
|
|
568 {
|
|
569 start = segments[0].a ;
|
|
570 if ( strcmp( mateChrom, "=" )
|
|
571 || ( flag & 0x10 ) == ( flag & 0x20 )
|
|
572 || ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) )
|
|
573 || ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) )
|
|
574 {
|
|
575 score |= 16 ;
|
|
576 }
|
|
577 }
|
|
578
|
|
579 printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ;
|
|
580 }
|
|
581
|
|
582 return 0 ;
|
|
583 }
|