Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/AddXS.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdio.h> | |
2 #include <stdint.h> | |
3 | |
4 #include <string.h> | |
5 #include <stdlib.h> | |
6 #include <vector> | |
7 #include <map> | |
8 #include <fstream> | |
9 | |
10 char usage[] = | |
11 "./addXS ref.fa [OPTIONS] < in.sam > out.sam\n" | |
12 "From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ; | |
13 char samLine[65537] ; | |
14 char seq[65537] ; | |
15 | |
16 char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2, | |
17 -1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A' | |
18 -1, -1, -1, -1, -1, 3, | |
19 -1, -1, -1, -1, -1, -1 } ; | |
20 char numToNuc[26] = {'A', 'C', 'G', 'T'} ; | |
21 | |
22 struct _pair | |
23 { | |
24 int a, b ; | |
25 } ; | |
26 | |
27 class BitSequence | |
28 { | |
29 private: | |
30 int len ; | |
31 //std::vector<uint32_t> sequence ; | |
32 uint32_t *sequence ; | |
33 | |
34 public: | |
35 BitSequence() { len = 0 ; sequence = NULL ;} | |
36 BitSequence( int l ) | |
37 { | |
38 len = 0 ; | |
39 sequence = new uint32_t[ l / 16 + 1 ] ; | |
40 } | |
41 | |
42 ~BitSequence() | |
43 { | |
44 //if ( sequence != NULL ) | |
45 // delete[] sequence ; | |
46 } | |
47 | |
48 void SetLength( int l ) | |
49 { | |
50 len = 0 ; | |
51 if ( sequence != NULL ) | |
52 delete[] sequence ; | |
53 sequence = new uint32_t[ l / 16 + 1 ] ; | |
54 } | |
55 | |
56 int GetLength() | |
57 { | |
58 return len ; | |
59 } | |
60 | |
61 void Append( char c ) | |
62 { | |
63 if ( ( len & 15 ) == 0 ) | |
64 { | |
65 sequence[ len / 16 ] = 0 ; | |
66 } | |
67 ++len ; | |
68 //printf( "%d %c\n", len, c ) ; | |
69 Set( c, len - 1 ) ; | |
70 } | |
71 | |
72 // pos is 0-based coordinate | |
73 // notice that the order within one 32 bit butcket is reversed | |
74 void Set( char c, int pos ) | |
75 { | |
76 if ( pos >= len ) | |
77 return ; | |
78 | |
79 if ( c >= 'a' && c <= 'z' ) | |
80 { | |
81 c = c - 'a' + 'A' ; | |
82 } | |
83 if ( c == 'N' ) | |
84 c = 'A' ; | |
85 | |
86 int ind = pos >> 4 ; | |
87 int offset = pos & 15 ; | |
88 int mask = ( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ; | |
89 sequence[ind] = sequence[ind] | mask ; | |
90 //if ( c != 'A' ) | |
91 // printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ; | |
92 //Print() ; | |
93 } | |
94 | |
95 char Get( int pos ) | |
96 { | |
97 if ( pos >= len ) | |
98 return 'N' ; | |
99 | |
100 int ind = pos >> 4 ; | |
101 int offset = pos & 15 ; | |
102 //printf( "%d: %d\n", pos, sequence[ind] ) ; | |
103 return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ; | |
104 } | |
105 | |
106 void Print() | |
107 { | |
108 int i ; | |
109 for ( i = 0 ; i < len ; ++i ) | |
110 printf( "%c", Get( i ) ) ; | |
111 printf( "\n" ) ; | |
112 } | |
113 | |
114 void Print( FILE *fp, int start, int end, bool rc ) | |
115 { | |
116 if ( !rc ) | |
117 { | |
118 for ( int i = start ; i <= end ; ++i ) | |
119 fprintf( fp, "%c", Get( i ) ) ; | |
120 } | |
121 else | |
122 { | |
123 for ( int i = end ; i >= start ; --i ) | |
124 { | |
125 char c = Get( i ) ; | |
126 if ( c == 'A' ) | |
127 c = 'T' ; | |
128 else if ( c == 'C' ) | |
129 c = 'G' ; | |
130 else if ( c == 'G' ) | |
131 c = 'C' ; | |
132 else //if ( c == 'T' ) | |
133 c = 'A' ; | |
134 fprintf( fp, "%c", c ) ; | |
135 } | |
136 } | |
137 } | |
138 } ; | |
139 | |
140 void ReverseComplement( char *s, int l ) | |
141 { | |
142 int u, v ; | |
143 for ( u = 0, v = l - 1 ; u <= v ; ++u, --v ) | |
144 { | |
145 char tmp ; | |
146 tmp = s[v] ; | |
147 s[v] = s[u] ; | |
148 s[u] = tmp ; | |
149 | |
150 s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ; | |
151 s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ; | |
152 } | |
153 } | |
154 | |
155 | |
156 int main( int argc, char *argv[] ) | |
157 { | |
158 int i, j, k ; | |
159 | |
160 std::vector<BitSequence> genome ; | |
161 std::map<std::string, int> chrToChrId ; | |
162 | |
163 char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ; | |
164 char buffer[1024] ; | |
165 char pattern[1024] ; | |
166 int start, mstart, flag, tlen ; // read start and mate read start | |
167 int seqLen ; | |
168 int chrCnt = 0 ; | |
169 int chrId ; | |
170 int width = 7 ; | |
171 int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error | |
172 | |
173 if ( argc < 2 ) | |
174 { | |
175 printf( "%s", usage ) ; | |
176 return 0 ; | |
177 } | |
178 | |
179 | |
180 std::ifstream fpRef ; | |
181 fpRef.open( argv[1] ) ; | |
182 std::string line ; | |
183 | |
184 int motifStrand[1024] ; | |
185 | |
186 /*motifStrand[ 0b10110010 ] = 1 ; // GT/AG | |
187 motifStrand[ 0b01110001 ] = 0 ; // CT/AC | |
188 motifStrand[ 0b10010010 ] = 1 ;// GC/AG | |
189 motifStrand[ 0b01111001 ] = 0 ; // CT/GC | |
190 motifStrand[ 0b00110001 ] = 1 ; // AT/AC | |
191 motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/ | |
192 memset( motifStrand, -1, sizeof( motifStrand )) ; | |
193 motifStrand[ 0xb2 ] = 1 ; // GT/AG | |
194 motifStrand[ 0x71 ] = 0 ; // CT/AC | |
195 motifStrand[ 0x92 ] = 1 ;// GC/AG | |
196 motifStrand[ 0x79 ] = 0 ; // CT/GC | |
197 motifStrand[ 0x31 ] = 1 ; // AT/AC | |
198 motifStrand[ 0xb3 ] = 0 ; // GT/AT | |
199 | |
200 k = 0 ; | |
201 while ( getline( fpRef, line ) ) | |
202 { | |
203 //printf( "%s\n", line.c_str() ) ; | |
204 int len = line.length() ; | |
205 if ( line[0] == '>' ) | |
206 { | |
207 //char *s = strdup( line.c_str() + 1 ) ; | |
208 if ( chrCnt > 0 ) | |
209 { | |
210 genome[ chrCnt - 1 ].SetLength( k ) ; | |
211 } | |
212 for ( i = 1 ; i < len ; ++i ) | |
213 if ( line[i] == ' ' || line[i] == '\t' ) | |
214 break ; | |
215 chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ; | |
216 ++chrCnt ; | |
217 | |
218 BitSequence bs ; | |
219 genome.push_back( bs ) ; | |
220 | |
221 k = 0 ; | |
222 } | |
223 else | |
224 { | |
225 k += len ; | |
226 } | |
227 } | |
228 genome[ chrCnt - 1 ].SetLength( k ) ; | |
229 fpRef.close() ; | |
230 | |
231 fpRef.open( argv[1] ) ; | |
232 while ( getline( fpRef, line ) ) | |
233 { | |
234 //printf( "%s\n", line.c_str() ) ; | |
235 int len = line.length() ; | |
236 if ( line[0] == '>' ) | |
237 { | |
238 //char *s = strdup( line.c_str() + 1 ) ; | |
239 for ( i = 1 ; i < len ; ++i ) | |
240 if ( line[i] == ' ' || line[i] == '\t' ) | |
241 break ; | |
242 chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ; | |
243 } | |
244 else | |
245 { | |
246 BitSequence &bs = genome[chrId] ; | |
247 for ( i = 0 ; i < len ; ++i ) | |
248 { | |
249 if ( ( line[i] >= 'A' && line[i] <= 'Z' ) || | |
250 ( line[i] >= 'a' && line[i] <= 'z' ) ) | |
251 bs.Append( line[i] ) ; | |
252 } | |
253 } | |
254 } | |
255 fpRef.close() ; | |
256 | |
257 | |
258 | |
259 while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL ) | |
260 { | |
261 if ( samLine[0] == '@' ) | |
262 { | |
263 printf( "%s", samLine ) ; | |
264 continue ; | |
265 } | |
266 | |
267 sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ; | |
268 struct _pair segments[1000] ; | |
269 struct _pair seqSegments[1000] ; // the segments with respect to the sequence. | |
270 int segCnt = 0 ; | |
271 int seqSegCnt = 0 ; | |
272 int num = 0 ; | |
273 int len = 0 ; | |
274 | |
275 score = 0 ; | |
276 | |
277 for ( i = 0 ; cigar[i] ; ++i ) | |
278 { | |
279 if ( cigar[i] >= '0' && cigar[i] <= '9' ) | |
280 num = num * 10 + cigar[i] - '0' ; | |
281 else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H' | |
282 || cigar[i] == 'P' ) | |
283 { | |
284 num = 0 ; | |
285 } | |
286 else if ( cigar[i] == 'N' ) | |
287 { | |
288 segments[segCnt].a = start ; | |
289 segments[segCnt].b = start + len - 1 ; | |
290 ++segCnt ; | |
291 | |
292 start = start + len + num ; | |
293 len = 0 ; | |
294 num = 0 ; | |
295 } | |
296 else | |
297 { | |
298 len += num ; | |
299 num = 0 ; | |
300 } | |
301 } | |
302 | |
303 | |
304 if ( len > 0 ) | |
305 { | |
306 segments[segCnt].a = start ; | |
307 segments[segCnt].b = start + len - 1 ; | |
308 ++segCnt ; | |
309 } | |
310 | |
311 if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL ) | |
312 { | |
313 printf( "%s", samLine ) ; | |
314 continue ; | |
315 } | |
316 | |
317 std::string schr( chrom ) ; | |
318 int chrId = chrToChrId[schr] ; | |
319 int strand = -1 ; | |
320 | |
321 len = strlen( samLine ) ; | |
322 samLine[len - 1] = '\0' ; | |
323 int motif = 0 ; | |
324 | |
325 BitSequence &chrSeq = genome[ chrId ] ; | |
326 | |
327 for ( i = 0 ; i < segCnt - 1 ; ++i ) | |
328 { | |
329 char m[4] ; | |
330 m[0] = chrSeq.Get( segments[i].b + 1 - 1 ) ; | |
331 m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ; | |
332 m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ; | |
333 m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ; | |
334 motif = 0 ; | |
335 for ( j = 0 ; j < 4 ; ++j ) | |
336 motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 ); | |
337 strand = motifStrand[ motif ] ; | |
338 if ( strand != -1 ) | |
339 break ; | |
340 } | |
341 if ( strand == 1 ) | |
342 { | |
343 printf( "%s\tXS:A:+\n", samLine ) ; | |
344 continue ; | |
345 } | |
346 else if ( strand == 0 ) | |
347 { | |
348 printf( "%s\tXS:A:-\n", samLine ) ; | |
349 continue ; | |
350 } | |
351 | |
352 char *p ; | |
353 if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL ) | |
354 { | |
355 p += 5 ; | |
356 num = 0 ; | |
357 while ( *p >= '0' && *p <= '9' ) | |
358 { | |
359 num = num * 10 + *p - '0' ; | |
360 ++p ; | |
361 } | |
362 if ( num > 1 ) | |
363 score |= 1 ; | |
364 } | |
365 | |
366 seqLen = strlen( seq ) ; | |
367 for ( i = 0 ; i < seqLen ; ++i ) | |
368 if ( seq[i] == 'N' ) | |
369 { | |
370 score |= 1 ; | |
371 break ; | |
372 } | |
373 | |
374 num = 0 ; | |
375 len = 0 ; | |
376 seqSegCnt = 0 ; | |
377 start = 0 ; | |
378 for ( i = 0 ; cigar[i] ; ++i ) | |
379 { | |
380 if ( cigar[i] >= '0' && cigar[i] <= '9' ) | |
381 num = num * 10 + cigar[i] - '0' ; | |
382 else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H' | |
383 || cigar[i] == 'P' ) | |
384 { | |
385 num = 0 ; | |
386 } | |
387 else if ( cigar[i] == 'N' ) | |
388 { | |
389 seqSegments[seqSegCnt].a = start ; | |
390 seqSegments[seqSegCnt].b = start + len - 1 ; | |
391 ++seqSegCnt ; | |
392 | |
393 start = start + len ; | |
394 len = 0 ; | |
395 num = 0 ; | |
396 } | |
397 else | |
398 { | |
399 len += num ; | |
400 num = 0 ; | |
401 } | |
402 } | |
403 if ( len > 0 ) | |
404 { | |
405 seqSegments[seqSegCnt].a = start ; | |
406 seqSegments[seqSegCnt].b = start + len - 1 ; | |
407 ++seqSegCnt ; | |
408 } | |
409 | |
410 // Check whether we can shift the intron to a canonical splice site | |
411 for ( i = 0 ; i < segCnt - 1 ; ++i ) | |
412 { | |
413 for ( j = -width ; j < width ; ++j ) | |
414 { | |
415 if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b ) | |
416 continue ; | |
417 // Look for the splice signal. | |
418 char m[4] ; | |
419 m[0] = chrSeq.Get( segments[i].b + j + 1 - 1 ) ; | |
420 m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ; | |
421 m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ; | |
422 m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ; | |
423 motif = 0 ; | |
424 for ( k = 0 ; k < 4 ; ++k ) | |
425 motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 ); | |
426 strand = motifStrand[ motif ] ; | |
427 if ( strand == -1 ) | |
428 continue ; | |
429 //printf( "%d %d: %d\n", i, j, motif ) ; | |
430 | |
431 // We found a signal, then test whether we can shift the intron. | |
432 // Extract the sequence from the reference genome. | |
433 int l = 0 ; | |
434 if ( j < 0 ) | |
435 for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l ) | |
436 buffer[l] = chrSeq.Get( k - 1 ) ; | |
437 else | |
438 for ( k = segments[i].b + 1, l= 0 ; k <= segments[i].b + j ; ++k, ++l ) | |
439 buffer[l] = chrSeq.Get(k - 1) ; | |
440 buffer[l] = '\0' ; | |
441 | |
442 //printf( "%s\n", buffer ) ; | |
443 | |
444 // Get the coordinate on the sequence. | |
445 int tag ; | |
446 int useBegin = 0 ; // is the portion from the beginning part of a seqSegment? | |
447 int from, to ; | |
448 if ( j < 0 ) | |
449 { | |
450 tag = i ; | |
451 useBegin = 0 ; | |
452 } | |
453 else // j>0 | |
454 { | |
455 tag = i + 1 ; | |
456 useBegin = 1 ; | |
457 } | |
458 //printf( "%d %d\n", tag, useBegin ) ; | |
459 | |
460 if ( ( flag & 0x10 ) != 0 ) | |
461 { | |
462 //TODO: it seems star already fliped the sequence. | |
463 // is it true for other aligners? | |
464 | |
465 //tag = segCnt - 1 - tag ; | |
466 //useBegin = 1 - useBegin ; | |
467 | |
468 | |
469 //ReverseComplement( buffer, l ) ; | |
470 } | |
471 | |
472 if ( useBegin ) | |
473 { | |
474 from = seqSegments[tag].a ; | |
475 to = seqSegments[tag].a + l - 1 ; | |
476 } | |
477 else | |
478 { | |
479 from = seqSegments[tag].b - l + 1 ; | |
480 to = seqSegments[tag].b ; | |
481 } | |
482 //printf( "%d %d %d\n", tag, from, to ) ; | |
483 | |
484 for ( k = 0 ; k < l ; ++k ) | |
485 { | |
486 if ( seq[from + k] != buffer[k] ) | |
487 break ; | |
488 } | |
489 | |
490 | |
491 if ( k >= l ) | |
492 { | |
493 // We can shift the intron | |
494 score |= 2 ; | |
495 break ; | |
496 } | |
497 } | |
498 if ( j < width ) | |
499 break ; | |
500 } | |
501 | |
502 // Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence. | |
503 for ( i = 0 ; i < seqSegCnt ; ++i ) | |
504 { | |
505 int l ; | |
506 for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k ) | |
507 buffer[k] = chrSeq.Get( j ) ; | |
508 buffer[k] = '0' ; | |
509 | |
510 for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l ) | |
511 { | |
512 pattern[l] = seq[j] ; | |
513 } | |
514 pattern[l] = '\0' ; | |
515 | |
516 // It seems the aligner already flipped the read in its output? | |
517 //if ( flag & 0x10 != 0 ) | |
518 // ReverseComplement( pattern, l ) ; | |
519 | |
520 | |
521 char *p = buffer ; | |
522 int cnt = 0 ; | |
523 while ( ( p = strstr( p, pattern ) ) != NULL ) | |
524 { | |
525 ++cnt ; | |
526 ++p ; | |
527 } | |
528 if ( cnt > 1 ) | |
529 { | |
530 score |= 4 ; | |
531 break ; | |
532 } | |
533 | |
534 int count[5] = { 0, 0, 0, 0, 0 } ; | |
535 int max = 0 ; | |
536 for ( j = 0 ; j < l ; ++j ) | |
537 { | |
538 ++count[ nucToNum[ pattern[j] - 'A' ] ] ; | |
539 } | |
540 | |
541 for ( j = 0 ; j < 5 ; ++j ) | |
542 { | |
543 if ( count[j] > max ) | |
544 max = count[j] ; | |
545 } | |
546 if ( max > 0.8 * l ) | |
547 { | |
548 score |= 4 ; | |
549 break ; | |
550 } | |
551 } | |
552 | |
553 if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL ) | |
554 { | |
555 int nm = atoi( p + 5 ) ; | |
556 if ( nm > 0 ) | |
557 { | |
558 score |= 8 ; | |
559 } | |
560 } | |
561 else | |
562 { | |
563 // TODO: check the nm by ourself | |
564 } | |
565 | |
566 // Check whether the alignment is concordant. | |
567 if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 ) | |
568 { | |
569 start = segments[0].a ; | |
570 if ( strcmp( mateChrom, "=" ) | |
571 || ( flag & 0x10 ) == ( flag & 0x20 ) | |
572 || ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) ) | |
573 || ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) ) | |
574 { | |
575 score |= 16 ; | |
576 } | |
577 } | |
578 | |
579 printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ; | |
580 } | |
581 | |
582 return 0 ; | |
583 } |