Mercurial > repos > dawe > srf2fastq
diff srf2fastq/io_lib-1.12.2/io_lib/scf_extras.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srf2fastq/io_lib-1.12.2/io_lib/scf_extras.c Tue Jun 07 17:48:05 2011 -0400 @@ -0,0 +1,241 @@ +/* + * Copyright (c) Medical Research Council 1998. All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation for any purpose is hereby granted without fee, provided that + * this copyright and notice appears in all copies and that credit is given + * where due. + * + * This file was written by James Bonfield, Simon Dear, Rodger Staden, + * Kathryn Beal, as part of the Staden Package at the MRC Laboratory of + * Molecular Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom. + * + * MRC disclaims all warranties with regard to this software. + */ + +/* + * This file contains the necessary code for reading the quality values from + * an SCF file. It supports both V2 and V3 SCF formats. + * It's done in an efficient manner by extracting only the relevant SCF + * components. + * This file is derived from the Gap4 source file scf_extras.c. + */ + +#include <stdlib.h> + +#include "io_lib/stdio_hack.h" +#include "io_lib/compress.h" +#include "io_lib/misc.h" +#include "io_lib/scf.h" +#include "io_lib/expFileIO.h" +#include "io_lib/traceType.h" +#include "io_lib/open_trace_file.h" +#include "io_lib/scf_extras.h" + +/* + * --------------------------------------------------------------------------- + * Loads confidence values from the trace file and averages them. + * 'opos' is optional - if not known then set to NULL. + * + * Returns 0 for success + * -1 for failure + */ +int get_read_conf(Exp_info *e, int length, int2 *opos, int1 *conf) { + int ttype, i; + FILE *fp; + uint_1 *prob_A, *prob_C, *prob_G, *prob_T; + char *seq; + float scf_version; + int nbases = 0; + + /* Sanity check */ + if (!(exp_Nentries(e,EFLT_LT) && exp_Nentries(e,EFLT_LN))) + return -1; + + /* Find and load trace file */ + ttype = trace_type_str2int(exp_get_entry(e, EFLT_LT)); + + if (ttype != TT_SCF && + ttype != TT_CTF && + ttype != TT_ZTR) + return -1; + + /* + * We only support direct reading accuracy values from SCF files. + * Otherwise we have to take a slower approach. + */ + if (ttype != TT_SCF) { + Read *r; + int sec = read_sections(0); + read_sections(READ_BASES); + + if (NULL == (r = read_reading(exp_get_entry(e,EFLT_LN), TT_ANYTR))) { + read_sections(sec); + return -1; + } + + prob_A = (int1 *)xmalloc(r->NBases); + prob_C = (int1 *)xmalloc(r->NBases); + prob_G = (int1 *)xmalloc(r->NBases); + prob_T = (int1 *)xmalloc(r->NBases); + seq = (char *)xmalloc(r->NBases); + + memcpy(prob_A, r->prob_A, r->NBases); + memcpy(prob_C, r->prob_C, r->NBases); + memcpy(prob_G, r->prob_G, r->NBases); + memcpy(prob_T, r->prob_T, r->NBases); + memcpy(seq, r->base, r->NBases); + + nbases = r->NBases; + + read_deallocate(r); + read_sections(sec); + + } else { + Header h; + /* For SCF files we read directly - the above code would also do. */ + + if (NULL == (fp = open_trace_file(exp_get_entry(e,EFLT_LN), NULL))) + return -1; + + /* Read the SCF header */ + if (-1 == read_scf_header(fp, &h)) + return -1; + scf_version = scf_version_str2float(h.version); + nbases = h.bases; + + /* Alloc memory */ + prob_A = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); + prob_C = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); + prob_G = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); + prob_T = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); + seq = (char *)xmalloc(h.bases * sizeof(*seq)); + if (NULL == prob_A || + NULL == prob_C || + NULL == prob_G || + NULL == prob_T || + NULL == seq) + return -1; + + /* Load base scores */ + if (scf_version >= 3.0) { + /* + * Version 3 base format: + * num_bases * 4byte peak index + * num_bases * prob_A + * num_bases * prob_C + * num_bases * prob_G + * num_bases * prob_T + * num_bases * base + * num_bases * spare (x3) + */ + fseek(fp, (off_t)h.bases_offset + 4 * h.bases, SEEK_SET); + if (h.bases != fread(prob_A, 1, h.bases, fp)) + return -1; + if (h.bases != fread(prob_C, 1, h.bases, fp)) + return -1; + if (h.bases != fread(prob_G, 1, h.bases, fp)) + return -1; + if (h.bases != fread(prob_T, 1, h.bases, fp)) + return -1; + if (h.bases != fread(seq, 1, h.bases, fp)) + return -1; + } else { + int i; + uint_1 buf[12]; + + /* + * Version 2 base format + * num_bases * base_struct, where base_struct is 12 bytes: + * 0-3 peak_index + * 4-7 prob_A/C/G/T + * 8 base + * 9- spare + */ + fseek(fp, (off_t)h.bases_offset, SEEK_SET); + + for (i = 0; (unsigned)i < h.bases; i++) { + if (1 != fread(buf, 12, 1, fp)) + return -1; + prob_A[i] = buf[4]; + prob_C[i] = buf[5]; + prob_G[i] = buf[6]; + prob_T[i] = buf[7]; + seq[i] = buf[8]; + } + } + + fclose(fp); + } + + /* Determine confidence values */ + if (opos) { + for (i=0; i<length; i++) { + if (opos[i] == 0) { + /* Inserted base, change to 0% */ + conf[i] = 0; + } else { + switch(seq[opos[i]-1]) { + case 'a': + case 'A': + conf[i] = prob_A[opos[i]-1]; + break; + case 'c': + case 'C': + conf[i] = prob_C[opos[i]-1]; + break; + case 'g': + case 'G': + conf[i] = prob_G[opos[i]-1]; + break; + case 't': + case 'T': + conf[i] = prob_T[opos[i]-1]; + break; + default: + conf[i] = 2; + } + } + } + } else { + int mlength = MIN(length, nbases); + + for (i=0; i < mlength; i++) { + switch(seq[i]) { + case 'a': + case 'A': + conf[i] = prob_A[i]; + break; + case 'c': + case 'C': + conf[i] = prob_C[i]; + break; + case 'g': + case 'G': + conf[i] = prob_G[i]; + break; + case 't': + case 'T': + conf[i] = prob_T[i]; + break; + case 'n': + case 'N': + case '-': + conf[i] = (prob_A[i] + prob_C[i] + prob_G[i] + prob_T[i]) / 4; + break; + default: + conf[i] = 2; + } + } + for (; i < length; i++) + conf[i] = 2; + } + + xfree(prob_A); + xfree(prob_C); + xfree(prob_G); + xfree(prob_T); + xfree(seq); + + return 0; +}