diff srf2fastq/io_lib-1.12.2/progs/extract_qual.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srf2fastq/io_lib-1.12.2/progs/extract_qual.c	Tue Jun 07 17:48:05 2011 -0400
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) Medical Research Council 1994-1999. All rights reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * this copyright and notice appears in all copies.
+ *
+ * This file was written by James Bonfield, Simon Dear, Rodger Staden,
+ * as part of the Staden Package at the MRC Laboratory of Molecular
+ * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
+ *
+ * MRC disclaims all warranties with regard to this software.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <io_lib/Read.h>
+#include <io_lib/traceType.h>
+#include <io_lib/expFileIO.h>
+#include <io_lib/open_trace_file.h>
+#include <io_lib/xalloc.h>
+
+/* #include "stdio_hack.h" */
+
+#define LINE_LENGTH 60
+
+/*
+ * Converts the confidence array to the accuracy value string (AV).
+ *
+ * Note no memory overrun checks are performed on buf. It is recommended
+ * that it is allocated to 4*len (worst case of "100 " for each base).
+ *
+ * Returns the buf argument.
+ */
+static char *my_conf2str(int1 *conf, int len, char *buf) {
+    int i;
+    char *ret = buf, *rs = buf;
+
+    for (i = 0; i < len; i++) {
+	sprintf(buf, "%d ", conf[i]);
+	buf += strlen(buf);
+
+	if (buf - rs > LINE_LENGTH) {
+	    *buf++ = '\n';
+	    *buf = '\0';
+	    rs = buf;
+	}
+    }
+
+    return ret;
+}
+
+static int do_trans(mFILE *infp, char *in_file, FILE *outfp, int format,
+		    int good_only, int clip_cosmid, int fasta_out) {
+    Read *r;
+    char *tmp_prob_A, *tmp_prob_C, *tmp_prob_G, *tmp_prob_T;
+    char *cstr = NULL;
+
+    read_sections(READ_BASES);
+    if (NULL == (r = mfread_reading(infp, in_file, format))) {
+	fprintf(stderr, "Failed to read file '%s'\n", in_file);
+	return 1;
+    }
+
+    tmp_prob_A = r->prob_A;
+    tmp_prob_C = r->prob_C;
+    tmp_prob_G = r->prob_G;
+    tmp_prob_T = r->prob_T;
+
+#ifdef IOLIB_EXP
+    if (good_only && r->orig_trace_format == TT_EXP) {
+	int left=0, right=r->NBases + 1, val, lval, rval;
+	Exp_info *e = (Exp_info *)r->orig_trace;
+
+	if (0 == exp_get_int(e, EFLT_SL, &val))
+	    if (val > left)
+		left = val;
+	if (0 == exp_get_int(e, EFLT_QL, &val))
+	    if (val > left)
+		left = val;
+
+	if (0 == exp_get_int(e, EFLT_SR, &val))
+	    if (val < right)
+		right = val;
+	if (0 == exp_get_int(e, EFLT_QR, &val))
+	    if (val < right)
+		right = val;
+
+	/* This is horrid - see gap seqInfo.c file for explaination */
+	if (clip_cosmid) {
+	    int got_cosmid;
+
+	    if (0 == exp_get_rng(e, EFLT_CS, &lval, &rval)) {
+		got_cosmid = 1;
+	    } else if (0 == exp_get_int(e, EFLT_CL, &lval) &&
+		       0 == exp_get_int(e, EFLT_CR, &rval)) {
+		got_cosmid = 1;
+	    } else {
+		got_cosmid = 0;
+	    }
+	
+	    if (got_cosmid) {
+		if      (lval <= left   && rval <= left)    ;
+		else if (lval <= left+1 && rval <  right)   left  = rval;
+		else if (lval <= left+1 && rval >= right)   right = left+1;
+		else if (lval <  right  && rval <  right)   right = lval;
+		else if (lval <  right  && rval >= right)   right = lval;
+	    }
+	}
+
+        r->prob_A += left;
+        r->prob_C += left;
+        r->prob_G += left;
+        r->prob_T += left;
+	r->NBases = right - left - 1;
+    } else
+#endif /* IOLIB_EXP */
+    if (good_only) {
+        r->prob_A += r->leftCutoff;
+        r->prob_C += r->leftCutoff;
+        r->prob_G += r->leftCutoff;
+        r->prob_T += r->leftCutoff;
+	r->NBases = r->rightCutoff - r->leftCutoff - 1;
+    }
+
+    /* Confidence values */
+    if (r->prob_A && r->prob_C && r->prob_G && r->prob_T &&
+	r->NBases > 0) {
+	int i;
+
+	/* We have some, but are they non zero values? */
+        for (i = 0; i < r->NBases; i++) {
+	    if (r->prob_A[i] || r->prob_C[i] ||
+		r->prob_G[i] || r->prob_T[i])
+		break;
+	}
+	if (i != r->NBases) {
+	    int1 *conf = (int1 *)xmalloc(r->NBases);
+	    cstr = (char *)xmalloc(r->NBases * 4 + 2);
+
+	    for (i = 0; i < r->NBases; i++) {
+		switch (r->base[i]) {
+		case 'a':
+		case 'A':
+		    conf[i] = r->prob_A[i];
+		    break;
+		case 'c':
+		case 'C':
+		    conf[i] = r->prob_C[i];
+		    break;
+		case 'g':
+		case 'G':
+		    conf[i] = r->prob_G[i];
+		    break;
+		case 't':
+		case 'T':
+		    conf[i] = r->prob_T[i];
+		    break;
+                case 'b':
+                case 'B':
+		    conf[i] = (r->prob_C[i] + r->prob_G[i] + r->prob_T[i]) / 3;
+                    break;
+                case 'd':
+                case 'D':
+		    conf[i] = (r->prob_A[i] + r->prob_G[i] + r->prob_T[i]) / 3;
+                    break;
+                case 'h':
+                case 'H':
+		    conf[i] = (r->prob_A[i] + r->prob_C[i] + r->prob_T[i]) / 3;
+                    break;
+                case 'v':
+                case 'V':
+		    conf[i] = (r->prob_A[i] + r->prob_C[i] + r->prob_G[i]) / 3;
+                    break;
+                case 'k':
+                case 'K':
+		    conf[i] = (r->prob_G[i] + r->prob_T[i]) / 2;
+                    break;
+                case 'm':
+                case 'M':
+		    conf[i] = (r->prob_A[i] + r->prob_C[i]) / 2;
+                    break;
+                case 'r':
+                case 'R':
+		    conf[i] = (r->prob_A[i] + r->prob_G[i]) / 2;
+                    break;
+                case 's':
+                case 'S':
+		    conf[i] = (r->prob_C[i] + r->prob_G[i]) / 2;
+                    break;
+                case 'w':
+                case 'W':
+		    conf[i] = (r->prob_A[i] + r->prob_T[i]) / 2;
+                    break;
+                case 'y':
+                case 'Y':
+		    conf[i] = (r->prob_C[i] + r->prob_T[i]) / 2;
+                    break;
+		default:
+		    conf[i] = (r->prob_A[i] + r->prob_C[i] + r->prob_G[i] + r->prob_T[i]) / 4;
+		}
+	    }
+
+	    my_conf2str(conf, r->NBases, cstr);
+	    xfree(conf);
+	}
+    }
+
+    if (fasta_out) {
+	char *p = strrchr(in_file, '/');
+	/* Add header */
+	if (NULL == p)
+	    p = in_file;
+	else
+	    p++;
+	fprintf(outfp, ">%s\n", p); 
+    }
+
+    if (cstr) {
+      fprintf(outfp,"%s\n", cstr);
+      xfree(cstr);
+    }
+
+    r->prob_A = tmp_prob_A;
+    r->prob_C = tmp_prob_C;
+    r->prob_G = tmp_prob_G;
+    r->prob_T = tmp_prob_T;
+    read_deallocate(r);
+    fflush(outfp);
+
+    return 0;
+}
+
+static void usage(void) {
+    fprintf(stderr, "Usage: extract_qual [-r] [-(abi|alf|scf|exp|pln|ctf|ztr)]\n"
+	    "                   [-good_only] [-clip_cosmid] [-fasta_out]\n"
+	    "                   [-output output_name] [input_name] ...\n");
+    exit(1);
+}
+
+int main(int argc, char **argv) {
+    int from_stdin = 1;
+    mFILE *infp = mstdin();
+    FILE *outfp = stdout;
+    int format = TT_ANY;
+    int redirect = 1;
+    int good_only = 0;
+    int clip_cosmid = 0;
+    int fasta_out = 0;
+    int ret = 0;
+    char *fofn = NULL;
+
+    for (argc--, argv++; argc > 0; argc--, argv++) {
+	if (strcmp(*argv, "-r") == 0) {
+	    redirect = 0;
+	} else if (strcasecmp(*argv, "-abi") == 0) {
+            format = TT_ABI;
+        } else if (strcasecmp(*argv, "-alf") == 0) {
+            format = TT_ALF;
+        } else if (strcasecmp(*argv, "-scf") == 0) {
+            format = TT_SCF;
+        } else if (strcasecmp(*argv, "-exp") == 0) {
+            format = TT_EXP;
+        } else if (strcasecmp(*argv, "-pln") == 0) {
+            format = TT_PLN;
+        } else if (strcasecmp(*argv, "-ztr") == 0) {
+            format = TT_ZTR;
+        } else if (strcasecmp(*argv, "-ctf") == 0) {
+            format = TT_CTF;
+        } else if (strcasecmp(*argv, "-good_only") == 0) {
+	    good_only = 1;
+        } else if (strcasecmp(*argv, "-clip_cosmid") == 0) {
+	    clip_cosmid = 1;
+	} else if (strcasecmp(*argv, "-fasta_out") == 0) {
+	    fasta_out = 1;
+	} else if (strcmp(*argv, "-fofn") == 0) {
+	    fofn = *++argv;
+	    argc--;
+	    from_stdin = 0;
+        } else if (strcasecmp(*argv, "-output") == 0) {
+	    if (NULL == (outfp = fopen(*++argv, "wb"))) {
+		perror(*argv);
+		return 1;
+	    }
+            argc--;
+	} else if (**argv != '-') {
+	    from_stdin = 0;
+	    break;
+        } else {
+            usage();
+        }
+    }
+
+    read_experiment_redirect(redirect);
+
+    if (!from_stdin) {
+	if (fofn) {
+	    FILE *fofn_fp;
+	    char line[8192];
+
+	    if (strcmp(fofn, "stdin") == 0)
+		fofn_fp = stdin;
+	    else
+		fofn_fp = fopen(fofn, "r");
+
+	    if (fofn_fp) {
+		while (fgets(line, 8192, fofn_fp) != NULL) {
+		    char *cp;
+		    if (cp = strchr(line, '\n'))
+			*cp = 0;
+		    if (format == TT_EXP) {
+			infp = open_exp_mfile(line, NULL);
+		    } else {
+			infp = open_trace_mfile(line, NULL);
+		    }
+		    if (NULL == infp) {
+			perror(line);
+			ret = 1;
+		    } else {
+			ret |= do_trans(infp, line, outfp, format, good_only,
+					clip_cosmid, fasta_out);
+			mfclose(infp);
+		    }
+		}
+		fclose(fofn_fp);
+	    }
+	}
+	for (;argc > 0; argc--, argv++) {
+	    if (format == TT_EXP) {
+		infp = open_exp_mfile(*argv, NULL);
+	    } else {
+		infp = open_trace_mfile(*argv, NULL);
+	    }
+	    if (NULL == infp) {
+		perror(*argv);
+		ret = 1;
+	    } else {
+		ret |= do_trans(infp, *argv, outfp, format, good_only,
+				clip_cosmid, fasta_out);
+		mfclose(infp);
+	    }
+	}
+    } else {
+	ret = do_trans(infp, "<stdin>", outfp, format, good_only, clip_cosmid,
+		       fasta_out);
+    }
+
+    return ret;
+}
+