diff src/sliding.c @ 4:c70137414dcd draft

sickle v1.33
author nikhil-joshi
date Wed, 23 Jul 2014 18:35:10 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/sliding.c	Wed Jul 23 18:35:10 2014 -0400
@@ -0,0 +1,137 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <zlib.h>
+#include <stdio.h>
+#include <getopt.h>
+#include "sickle.h"
+#include "kseq.h"
+
+int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) {
+  /* 
+     Return the adjusted quality, depending on quality type.
+
+     Note that this uses the array in sickle.h, which *approximates*
+     the SOLEXA (pre-1.3 pipeline) qualities as linear. This is
+     inaccurate with low-quality bases.
+  */
+
+  int qual_value = (int) qualchar;
+
+  if (qual_value < quality_constants[qualtype][Q_MIN] || qual_value > quality_constants[qualtype][Q_MAX]) {
+	fprintf (stderr, "ERROR: Quality value (%d) does not fall within correct range for %s encoding.\n", qual_value, typenames[qualtype]);
+	fprintf (stderr, "Range for %s encoding: %d-%d\n", typenames[qualtype], quality_constants[qualtype][Q_MIN], quality_constants[qualtype][Q_MAX]);
+	fprintf (stderr, "FastQ record: %s\n", fqrec->name.s);
+	fprintf (stderr, "Quality string: %s\n", fqrec->qual.s);
+	fprintf (stderr, "Quality char: '%c'\n", qualchar);
+	fprintf (stderr, "Quality position: %d\n", pos+1);
+	exit(1);
+  }
+
+  return (qual_value - quality_constants[qualtype][Q_OFFSET]);
+}
+
+
+cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) {
+
+	int window_size = (int) (0.1 * fqrec->seq.l);
+	int i,j;
+	int window_start=0;
+	int window_total=0;
+	int three_prime_cut = fqrec->seq.l;
+	int five_prime_cut = 0;
+	int found_five_prime = 0;
+	double window_avg;
+	cutsites* retvals;
+    char *npos;
+
+	/* discard if the length of the sequence is less than the length threshold */
+    if (fqrec->seq.l < length_threshold) {
+		retvals = (cutsites*) malloc (sizeof(cutsites));
+		retvals->three_prime_cut = -1;
+		retvals->five_prime_cut = -1;
+		return (retvals);
+	}
+
+	/* if the seq length is less then 10bp, */
+	/* then make the window size the length of the seq */
+	if (window_size == 0) window_size = fqrec->seq.l;
+
+	for (i=0; i<window_size; i++) {
+		window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
+	}
+
+	for (i=0; i <= fqrec->qual.l - window_size; i++) {
+
+		window_avg = (double)window_total / (double)window_size;
+
+        if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg);
+
+		/* Finding the 5' cutoff */
+		/* Find when the average quality in the window goes above the threshold starting from the 5' end */
+		if (no_fiveprime == 0 && found_five_prime == 0 && window_avg >= qual_threshold) {
+
+        if (debug) printf ("inside 5-prime cut\n");
+
+			/* at what point in the window does the quality go above the threshold? */
+			for (j=window_start; j<window_start+window_size; j++) {
+				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) >= qual_threshold) {
+					five_prime_cut = j;
+					break;
+				}
+			}
+
+            if (debug) printf ("five_prime_cut: %d\n", five_prime_cut);
+
+			found_five_prime = 1;
+		}
+
+		/* Finding the 3' cutoff */
+		/* if the average quality in the window is less than the threshold */
+		/* or if the window is the last window in the read */
+		if ((window_avg < qual_threshold || 
+			window_start+window_size > fqrec->qual.l) && (found_five_prime == 1 || no_fiveprime)) {
+
+			/* at what point in the window does the quality dip below the threshold? */
+			for (j=window_start; j<window_start+window_size; j++) {
+				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) < qual_threshold) {
+					three_prime_cut = j;
+					break;
+				}
+			}
+
+			break;
+		}
+
+		/* instead of sliding the window, subtract the first qual and add the next qual */
+		window_total -= get_quality_num (fqrec->qual.s[window_start], qualtype, fqrec, window_start);
+		if (window_start+window_size < fqrec->qual.l) {
+			window_total += get_quality_num (fqrec->qual.s[window_start+window_size], qualtype, fqrec, window_start+window_size);
+		}
+		window_start++;
+	}
+
+
+    /* If truncate N option is selected, and sequence has Ns, then */
+    /* change 3' cut site to be the base before the first N */
+    if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) {
+        three_prime_cut = npos - fqrec->seq.s;
+    }
+
+    /* if cutting length is less than threshold then return -1 for both */
+    /* to indicate that the read should be discarded */
+    /* Also, if you never find a five prime cut site, then discard whole read */
+    if ((found_five_prime == 0 && !no_fiveprime) || (three_prime_cut - five_prime_cut < length_threshold)) {
+        three_prime_cut = -1;
+        five_prime_cut = -1;
+
+        if (debug) printf("%s\n", fqrec->name.s);
+    }
+
+    if (debug) printf ("\n\n");
+
+	retvals = (cutsites*) malloc (sizeof(cutsites));
+	retvals->three_prime_cut = three_prime_cut;
+	retvals->five_prime_cut = five_prime_cut;
+	return (retvals);
+}