Mercurial > repos > dawe > srf2fastq
diff srf2fastq/io_lib-1.12.2/io_lib/sff.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srf2fastq/io_lib-1.12.2/io_lib/sff.c Tue Jun 07 17:48:05 2011 -0400 @@ -0,0 +1,380 @@ +/* + * Portions of this code have been derived from 454 Life Sciences Corporation's + * getsff.c code (specifically the WriteSFFFile function. + * It bears the following copyright notice: + * + * ------------------------------------------------------------ + * Copyright (c)[2001-2005] 454 Life Sciences Corporation. All Rights Reserved. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * + * IN NO EVENT SHALL LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE. + * + * Permission to use, copy, modify and distribute this software and its + * documentation for any purpose is hereby granted without fee, provided + * that this copyright and notice appears in all copies. + * ------------------------------------------------------------ + * + * The remainder is Copyright Genome Research Limited (GRL) and is covered + * by a BSD style license as described elsewhere in this source tree. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "io_lib/Read.h" +#include "io_lib/xalloc.h" +#include "io_lib/sff.h" +#include "io_lib/misc.h" + +/* -------------------------------------------------------------------------*/ +/* General purpose decoding and mFILE reading functions */ + +/* + * Unpacks the 31-byte fixed size part of the SFF common header. + * It allocates memory for this and for the flow order and key, but does + * not read the flow & key information (as this may not be in buf). + * It also checks that the MAGIC and VERSION match as expected. + * + * Returns sff_common_header* on success + * NULL on failure + */ +sff_common_header *decode_sff_common_header(unsigned char *buf) { + sff_common_header *h; + + if (NULL == (h = (sff_common_header *)xcalloc(1, sizeof(*h)))) + return NULL; + + h->magic = be_int4(*(uint32_t *)(buf+0)); + memcpy(h->version, buf+4, 4); + h->index_offset = be_int8(*(uint64_t *)(buf+8)); + h->index_len = be_int4(*(uint32_t *)(buf+16)); + h->nreads = be_int4(*(uint32_t *)(buf+20)); + h->header_len = be_int2(*(uint16_t *)(buf+24)); + h->key_len = be_int2(*(uint16_t *)(buf+26)); + h->flow_len = be_int2(*(uint16_t *)(buf+28)); + h->flowgram_format = be_int1(*(uint8_t *)(buf+30)); + + if (h->magic != SFF_MAGIC || memcmp(h->version, SFF_VERSION, 4)) { + xfree(h); + return NULL; + } + + if (NULL == (h->flow = (char *)xmalloc(h->flow_len))) + return free_sff_common_header(h), NULL; + if (NULL == (h->key = (char *)xmalloc(h->key_len))) + return free_sff_common_header(h), NULL; + + return h; +} + +/* + * Encodes the data in 'h' to the file SFF representation. Buf should be + * allocated to be 31 + h->flow_len + h->key_len + 8. + * + * Returns: the written length of buf + */ +int encode_sff_common_header(sff_common_header *h, unsigned char *buf) { + int end; + + *(uint32_t *)(buf+0) = be_int4(h->magic); + memcpy(buf+4, h->version, 4); + *(uint64_t *)(buf+8) = be_int8(h->index_offset); + *(uint32_t *)(buf+16) = be_int4(h->index_len); + *(uint32_t *)(buf+20) = be_int4(h->nreads); + *(uint16_t *)(buf+24) = be_int2(h->header_len); + *(uint16_t *)(buf+26) = be_int2(h->key_len); + *(uint16_t *)(buf+28) = be_int2(h->flow_len); + *(uint8_t *)(buf+30) = be_int1(h->flowgram_format); + memcpy(buf+31, h->flow, h->flow_len); + memcpy(buf+31+h->flow_len, h->key, h->key_len); + end = 31+h->flow_len+h->key_len; + memcpy(buf+end, "\0\0\0\0\0\0\0\0", ((end+7)&~7)-end); + + return (end+7)&~7; +} + +/* + * Reads a common header (including variable length components) from an mFILE. + * + * Returns the a pointer to the header on success + * NULL on failure + */ +sff_common_header *read_sff_common_header(mFILE *mf) { + sff_common_header *h; + unsigned char chdr[31]; + + if (31 != mfread(chdr, 1, 31, mf)) + return NULL; + h = decode_sff_common_header(chdr); + + if (h->flow_len != mfread(h->flow, 1, h->flow_len, mf)) + return free_sff_common_header(h), NULL; + if (h->key_len != mfread(h->key , 1, h->key_len, mf)) + return free_sff_common_header(h), NULL; + + /* Pad to 8 chars */ + mfseek(mf, (mftell(mf) + 7)& ~7, SEEK_SET); + + return h; +} + +/* + * Deallocates memory used by an SFF common header. + */ +void free_sff_common_header(sff_common_header *h) { + if (!h) + return; + if (h->flow) + xfree(h->flow); + if (h->key) + xfree(h->key); + xfree(h); +} + +/* + * Unpacks the 16-byte fixed size part of the SFF read header. + * It allocates memory for this and for the base calls, but does not + * unpack these. + * + * Returns sff_read_header* on success + * NULL on failure + */ +sff_read_header *decode_sff_read_header(unsigned char *buf) { + sff_read_header *h; + + if (NULL == (h = (sff_read_header *)xcalloc(1, sizeof(*h)))) + return NULL; + + h->header_len = be_int2(*(uint16_t *)(buf+0)); + h->name_len = be_int2(*(uint16_t *)(buf+2)); + h->nbases = be_int4(*(uint32_t *)(buf+4)); + h->clip_qual_left = be_int2(*(uint16_t *)(buf+8)); + h->clip_qual_right = be_int2(*(uint16_t *)(buf+10)); + h->clip_adapter_left = be_int2(*(uint16_t *)(buf+12)); + h->clip_adapter_right = be_int2(*(uint16_t *)(buf+14)); + + if (NULL == (h->name = (char *)xmalloc(h->name_len))) + return free_sff_read_header(h), NULL; + + return h; +} + +/* + * Encodes the data in 'h' to the file SFF representation. Buf should be + * allocated to be 16 + h->name_len + 8. + * + * Returns: the written length of buf + */ +int encode_sff_read_header(sff_read_header *h, unsigned char *buf) { + int end; + + *(uint16_t *)(buf+0) = be_int2(h->header_len); + *(uint16_t *)(buf+2) = be_int2(h->name_len); + *(uint32_t *)(buf+4) = be_int4(h->nbases); + *(uint16_t *)(buf+8) = be_int2(h->clip_qual_left); + *(uint16_t *)(buf+10) = be_int2(h->clip_qual_right); + *(uint16_t *)(buf+12) = be_int2(h->clip_adapter_left); + *(uint16_t *)(buf+14) = be_int2(h->clip_adapter_right); + memcpy(buf+16, h->name, h->name_len); + end = 16+h->name_len; + memcpy(buf+end, "\0\0\0\0\0\0\0\0", ((end+7)&~7)-end); + + return (end+7)&~7; +} + +/* + * Reads a read header (including variable length components) from an mFILE. + * + * Returns the a pointer to the header on success + * NULL on failure + */ +sff_read_header *read_sff_read_header(mFILE *mf) { + sff_read_header *h; + unsigned char rhdr[16]; + + if (16 != mfread(rhdr, 1, 16, mf)) + return NULL; + h = decode_sff_read_header(rhdr); + + if (h->name_len != mfread(h->name, 1, h->name_len, mf)) + return free_sff_read_header(h), NULL; + + /* Pad to 8 chars */ + mfseek(mf, (mftell(mf) + 7)& ~7, SEEK_SET); + + return h; +} + +/* + * Deallocates memory used by an SFF read header + */ +void free_sff_read_header(sff_read_header *h) { + if (!h) + return; + if (h->name) + xfree(h->name); + free(h); +} + +/* + * Reads a read data block from an mFILE given a count of the number of + * flows and basecalls (from the common header and read headers). + * + * Returns the a pointer to sff_read_data on success + * NULL on failure + */ +sff_read_data *read_sff_read_data(mFILE *mf, int nflows, int nbases) { + sff_read_data *d; + int i; + + if (NULL == (d = (sff_read_data *)xcalloc(1, sizeof(*d)))) + return NULL; + + if (NULL == (d->flowgram = (uint16_t *)xcalloc(nflows, 2))) + return free_sff_read_data(d), NULL; + if (nflows != mfread(d->flowgram, 2, nflows, mf)) + return free_sff_read_data(d), NULL; + for (i = 0; i < nflows; i++) + d->flowgram[i] = be_int2(d->flowgram[i]); + + if (NULL == (d->flow_index = (uint8_t *)xmalloc(nbases))) + return free_sff_read_data(d), NULL; + if (nbases != mfread(d->flow_index, 1, nbases, mf)) + return free_sff_read_data(d), NULL; + + if (NULL == (d->bases = (char *)xmalloc(nbases))) + return free_sff_read_data(d), NULL; + if (nbases != mfread(d->bases, 1, nbases, mf)) + return free_sff_read_data(d), NULL; + + if (NULL == (d->quality = (uint8_t *)xmalloc(nbases))) + return free_sff_read_data(d), NULL; + if (nbases != mfread(d->quality, 1, nbases, mf)) + return free_sff_read_data(d), NULL; + + /* Pad to 8 chars */ + mfseek(mf, (mftell(mf) + 7)& ~7, SEEK_SET); + + return d; +} + +/* + * Deallocates memory used by an SFF read data block + */ +void free_sff_read_data(sff_read_data *d) { + if (!d) + return; + if (d->flowgram) + xfree(d->flowgram); + if (d->flow_index) + xfree(d->flow_index); + if (d->bases) + xfree(d->bases); + if (d->quality) + xfree(d->quality); + xfree(d); +} + + +/* -------------------------------------------------------------------------*/ + +/* + * Reads an SFF file from an mFILE and decodes it to a Read struct. + * + * Returns Read* on success + * NULL on failure + */ +Read *mfread_sff(mFILE *mf) { + int i, bpos; + Read *r; + sff_common_header *ch; + sff_read_header *rh; + sff_read_data *rd; + + /* Load the SFF contents */ + if (NULL == (ch = read_sff_common_header(mf))) + return NULL; + if (NULL == (rh = read_sff_read_header(mf))) { + free_sff_common_header(ch); + return NULL; + } + if (NULL == (rd = read_sff_read_data(mf, ch->flow_len, rh->nbases))) { + free_sff_common_header(ch); + free_sff_read_header(rh); + return NULL; + } + + /* Convert to Read struct */ + r = read_allocate(0,0); + if (r->basePos) free(r->basePos); + if (r->base) free(r->base); + if (r->prob_A) free(r->prob_A); + if (r->prob_C) free(r->prob_C); + if (r->prob_G) free(r->prob_G); + if (r->prob_T) free(r->prob_T); + + r->nflows = ch->flow_len; + r->flow_order = ch->flow; ch->flow = NULL; + r->flow_raw = NULL; + r->flow = (float *)malloc(r->nflows * sizeof(float)); + for (i = 0; i < r->nflows; i++) { + r->flow[i] = rd->flowgram[i] / 100.0; + } + + r->NBases = rh->nbases; + r->basePos = (uint_2 *)calloc(r->NBases, 2); + r->base = rd->bases; rd->bases = NULL; + r->prob_A = (char *)calloc(r->NBases, 1); + r->prob_C = (char *)calloc(r->NBases, 1); + r->prob_G = (char *)calloc(r->NBases, 1); + r->prob_T = (char *)calloc(r->NBases, 1); + + bpos = 0; + for (i=0; i < r->NBases; i++) { + r->prob_A[i] = 0; + r->prob_C[i] = 0; + r->prob_G[i] = 0; + r->prob_T[i] = 0; + switch (r->base[i]) { + case 'A': + case 'a': + r->prob_A[i] = rd->quality[i]; + break; + case 'C': + case 'c': + r->prob_C[i] = rd->quality[i]; + break; + case 'G': + case 'g': + r->prob_G[i] = rd->quality[i]; + break; + case 'T': + case 't': + r->prob_T[i] = rd->quality[i]; + break; + } + + bpos += rd->flow_index[i]; + r->basePos[i] = bpos; + } + + r->leftCutoff = MAX(rh->clip_qual_left, rh->clip_adapter_left); + r->rightCutoff = MIN(rh->clip_qual_right + ? rh->clip_qual_right + : r->NBases+1, + rh->clip_adapter_right + ? rh->clip_adapter_right + : r->NBases+1); + + free_sff_common_header(ch); + free_sff_read_header(rh); + free_sff_read_data(rd); + + return r; +}