4
|
1 #include <assert.h>
|
|
2 #include <ctype.h>
|
|
3 #include <stdlib.h>
|
|
4 #include <zlib.h>
|
|
5 #include <stdio.h>
|
|
6 #include <getopt.h>
|
|
7 #include "sickle.h"
|
|
8 #include "kseq.h"
|
|
9
|
|
10 int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) {
|
|
11 /*
|
|
12 Return the adjusted quality, depending on quality type.
|
|
13
|
|
14 Note that this uses the array in sickle.h, which *approximates*
|
|
15 the SOLEXA (pre-1.3 pipeline) qualities as linear. This is
|
|
16 inaccurate with low-quality bases.
|
|
17 */
|
|
18
|
|
19 int qual_value = (int) qualchar;
|
|
20
|
|
21 if (qual_value < quality_constants[qualtype][Q_MIN] || qual_value > quality_constants[qualtype][Q_MAX]) {
|
|
22 fprintf (stderr, "ERROR: Quality value (%d) does not fall within correct range for %s encoding.\n", qual_value, typenames[qualtype]);
|
|
23 fprintf (stderr, "Range for %s encoding: %d-%d\n", typenames[qualtype], quality_constants[qualtype][Q_MIN], quality_constants[qualtype][Q_MAX]);
|
|
24 fprintf (stderr, "FastQ record: %s\n", fqrec->name.s);
|
|
25 fprintf (stderr, "Quality string: %s\n", fqrec->qual.s);
|
|
26 fprintf (stderr, "Quality char: '%c'\n", qualchar);
|
|
27 fprintf (stderr, "Quality position: %d\n", pos+1);
|
|
28 exit(1);
|
|
29 }
|
|
30
|
|
31 return (qual_value - quality_constants[qualtype][Q_OFFSET]);
|
|
32 }
|
|
33
|
|
34
|
|
35 cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) {
|
|
36
|
|
37 int window_size = (int) (0.1 * fqrec->seq.l);
|
|
38 int i,j;
|
|
39 int window_start=0;
|
|
40 int window_total=0;
|
|
41 int three_prime_cut = fqrec->seq.l;
|
|
42 int five_prime_cut = 0;
|
|
43 int found_five_prime = 0;
|
|
44 double window_avg;
|
|
45 cutsites* retvals;
|
|
46 char *npos;
|
|
47
|
|
48 /* discard if the length of the sequence is less than the length threshold */
|
|
49 if (fqrec->seq.l < length_threshold) {
|
|
50 retvals = (cutsites*) malloc (sizeof(cutsites));
|
|
51 retvals->three_prime_cut = -1;
|
|
52 retvals->five_prime_cut = -1;
|
|
53 return (retvals);
|
|
54 }
|
|
55
|
|
56 /* if the seq length is less then 10bp, */
|
|
57 /* then make the window size the length of the seq */
|
|
58 if (window_size == 0) window_size = fqrec->seq.l;
|
|
59
|
|
60 for (i=0; i<window_size; i++) {
|
|
61 window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
|
|
62 }
|
|
63
|
|
64 for (i=0; i <= fqrec->qual.l - window_size; i++) {
|
|
65
|
|
66 window_avg = (double)window_total / (double)window_size;
|
|
67
|
|
68 if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg);
|
|
69
|
|
70 /* Finding the 5' cutoff */
|
|
71 /* Find when the average quality in the window goes above the threshold starting from the 5' end */
|
|
72 if (no_fiveprime == 0 && found_five_prime == 0 && window_avg >= qual_threshold) {
|
|
73
|
|
74 if (debug) printf ("inside 5-prime cut\n");
|
|
75
|
|
76 /* at what point in the window does the quality go above the threshold? */
|
|
77 for (j=window_start; j<window_start+window_size; j++) {
|
|
78 if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) >= qual_threshold) {
|
|
79 five_prime_cut = j;
|
|
80 break;
|
|
81 }
|
|
82 }
|
|
83
|
|
84 if (debug) printf ("five_prime_cut: %d\n", five_prime_cut);
|
|
85
|
|
86 found_five_prime = 1;
|
|
87 }
|
|
88
|
|
89 /* Finding the 3' cutoff */
|
|
90 /* if the average quality in the window is less than the threshold */
|
|
91 /* or if the window is the last window in the read */
|
|
92 if ((window_avg < qual_threshold ||
|
|
93 window_start+window_size > fqrec->qual.l) && (found_five_prime == 1 || no_fiveprime)) {
|
|
94
|
|
95 /* at what point in the window does the quality dip below the threshold? */
|
|
96 for (j=window_start; j<window_start+window_size; j++) {
|
|
97 if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) < qual_threshold) {
|
|
98 three_prime_cut = j;
|
|
99 break;
|
|
100 }
|
|
101 }
|
|
102
|
|
103 break;
|
|
104 }
|
|
105
|
|
106 /* instead of sliding the window, subtract the first qual and add the next qual */
|
|
107 window_total -= get_quality_num (fqrec->qual.s[window_start], qualtype, fqrec, window_start);
|
|
108 if (window_start+window_size < fqrec->qual.l) {
|
|
109 window_total += get_quality_num (fqrec->qual.s[window_start+window_size], qualtype, fqrec, window_start+window_size);
|
|
110 }
|
|
111 window_start++;
|
|
112 }
|
|
113
|
|
114
|
|
115 /* If truncate N option is selected, and sequence has Ns, then */
|
|
116 /* change 3' cut site to be the base before the first N */
|
|
117 if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) {
|
|
118 three_prime_cut = npos - fqrec->seq.s;
|
|
119 }
|
|
120
|
|
121 /* if cutting length is less than threshold then return -1 for both */
|
|
122 /* to indicate that the read should be discarded */
|
|
123 /* Also, if you never find a five prime cut site, then discard whole read */
|
|
124 if ((found_five_prime == 0 && !no_fiveprime) || (three_prime_cut - five_prime_cut < length_threshold)) {
|
|
125 three_prime_cut = -1;
|
|
126 five_prime_cut = -1;
|
|
127
|
|
128 if (debug) printf("%s\n", fqrec->name.s);
|
|
129 }
|
|
130
|
|
131 if (debug) printf ("\n\n");
|
|
132
|
|
133 retvals = (cutsites*) malloc (sizeof(cutsites));
|
|
134 retvals->three_prime_cut = three_prime_cut;
|
|
135 retvals->five_prime_cut = five_prime_cut;
|
|
136 return (retvals);
|
|
137 }
|