annotate src/sliding.c @ 4:c70137414dcd draft

sickle v1.33
author nikhil-joshi
date Wed, 23 Jul 2014 18:35:10 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
1 #include <assert.h>
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
2 #include <ctype.h>
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
3 #include <stdlib.h>
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
4 #include <zlib.h>
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
5 #include <stdio.h>
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
6 #include <getopt.h>
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
7 #include "sickle.h"
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
8 #include "kseq.h"
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
9
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
10 int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
11 /*
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
12 Return the adjusted quality, depending on quality type.
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
13
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
14 Note that this uses the array in sickle.h, which *approximates*
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
15 the SOLEXA (pre-1.3 pipeline) qualities as linear. This is
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
16 inaccurate with low-quality bases.
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
17 */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
18
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
19 int qual_value = (int) qualchar;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
20
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
21 if (qual_value < quality_constants[qualtype][Q_MIN] || qual_value > quality_constants[qualtype][Q_MAX]) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
22 fprintf (stderr, "ERROR: Quality value (%d) does not fall within correct range for %s encoding.\n", qual_value, typenames[qualtype]);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
23 fprintf (stderr, "Range for %s encoding: %d-%d\n", typenames[qualtype], quality_constants[qualtype][Q_MIN], quality_constants[qualtype][Q_MAX]);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
24 fprintf (stderr, "FastQ record: %s\n", fqrec->name.s);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
25 fprintf (stderr, "Quality string: %s\n", fqrec->qual.s);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
26 fprintf (stderr, "Quality char: '%c'\n", qualchar);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
27 fprintf (stderr, "Quality position: %d\n", pos+1);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
28 exit(1);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
29 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
30
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
31 return (qual_value - quality_constants[qualtype][Q_OFFSET]);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
32 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
33
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
34
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
35 cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
36
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
37 int window_size = (int) (0.1 * fqrec->seq.l);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
38 int i,j;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
39 int window_start=0;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
40 int window_total=0;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
41 int three_prime_cut = fqrec->seq.l;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
42 int five_prime_cut = 0;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
43 int found_five_prime = 0;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
44 double window_avg;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
45 cutsites* retvals;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
46 char *npos;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
47
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
48 /* discard if the length of the sequence is less than the length threshold */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
49 if (fqrec->seq.l < length_threshold) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
50 retvals = (cutsites*) malloc (sizeof(cutsites));
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
51 retvals->three_prime_cut = -1;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
52 retvals->five_prime_cut = -1;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
53 return (retvals);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
54 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
55
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
56 /* if the seq length is less then 10bp, */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
57 /* then make the window size the length of the seq */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
58 if (window_size == 0) window_size = fqrec->seq.l;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
59
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
60 for (i=0; i<window_size; i++) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
61 window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
62 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
63
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
64 for (i=0; i <= fqrec->qual.l - window_size; i++) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
65
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
66 window_avg = (double)window_total / (double)window_size;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
67
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
68 if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
69
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
70 /* Finding the 5' cutoff */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
71 /* Find when the average quality in the window goes above the threshold starting from the 5' end */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
72 if (no_fiveprime == 0 && found_five_prime == 0 && window_avg >= qual_threshold) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
73
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
74 if (debug) printf ("inside 5-prime cut\n");
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
75
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
76 /* at what point in the window does the quality go above the threshold? */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
77 for (j=window_start; j<window_start+window_size; j++) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
78 if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) >= qual_threshold) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
79 five_prime_cut = j;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
80 break;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
81 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
82 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
83
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
84 if (debug) printf ("five_prime_cut: %d\n", five_prime_cut);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
85
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
86 found_five_prime = 1;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
87 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
88
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
89 /* Finding the 3' cutoff */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
90 /* if the average quality in the window is less than the threshold */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
91 /* or if the window is the last window in the read */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
92 if ((window_avg < qual_threshold ||
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
93 window_start+window_size > fqrec->qual.l) && (found_five_prime == 1 || no_fiveprime)) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
94
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
95 /* at what point in the window does the quality dip below the threshold? */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
96 for (j=window_start; j<window_start+window_size; j++) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
97 if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) < qual_threshold) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
98 three_prime_cut = j;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
99 break;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
100 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
101 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
102
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
103 break;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
104 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
105
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
106 /* instead of sliding the window, subtract the first qual and add the next qual */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
107 window_total -= get_quality_num (fqrec->qual.s[window_start], qualtype, fqrec, window_start);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
108 if (window_start+window_size < fqrec->qual.l) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
109 window_total += get_quality_num (fqrec->qual.s[window_start+window_size], qualtype, fqrec, window_start+window_size);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
110 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
111 window_start++;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
112 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
113
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
114
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
115 /* If truncate N option is selected, and sequence has Ns, then */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
116 /* change 3' cut site to be the base before the first N */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
117 if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
118 three_prime_cut = npos - fqrec->seq.s;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
119 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
120
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
121 /* if cutting length is less than threshold then return -1 for both */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
122 /* to indicate that the read should be discarded */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
123 /* Also, if you never find a five prime cut site, then discard whole read */
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
124 if ((found_five_prime == 0 && !no_fiveprime) || (three_prime_cut - five_prime_cut < length_threshold)) {
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
125 three_prime_cut = -1;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
126 five_prime_cut = -1;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
127
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
128 if (debug) printf("%s\n", fqrec->name.s);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
129 }
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
130
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
131 if (debug) printf ("\n\n");
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
132
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
133 retvals = (cutsites*) malloc (sizeof(cutsites));
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
134 retvals->three_prime_cut = three_prime_cut;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
135 retvals->five_prime_cut = five_prime_cut;
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
136 return (retvals);
c70137414dcd sickle v1.33
nikhil-joshi
parents:
diff changeset
137 }