Mercurial > repos > dawe > srf2fastq

diff srf2fastq/io_lib-1.12.2/progs/srf2fastq.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author: dawe
date: Tue, 07 Jun 2011 17:48:05 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srf2fastq/io_lib-1.12.2/progs/srf2fastq.c	Tue Jun 07 17:48:05 2011 -0400
@@ -0,0 +1,649 @@
+/*
+ * ======================================================================
+ * This software has been created by Genome Research Limited (GRL).
+ *
+ * GRL hereby grants permission to use, copy, modify and distribute
+ * this software and its documentation for non-commercial purposes
+ * without fee at the user's own risk on the basis set out below.
+ *
+ * GRL neither undertakes nor accepts any duty whether contractual or
+ * otherwise in connection with the software, its use or the use of
+ * any derivative, and makes no representations or warranties, express
+ * or implied, concerning the software, its suitability, fitness for
+ * a particular purpose or non-infringement.
+ *
+ * In no event shall the authors of the software or GRL be responsible
+ * or liable for any loss or damage whatsoever arising in any way
+ * directly or indirectly out of the use of this software or its
+ * derivatives, even if advised of the possibility of such damage.
+ *
+ * Our software can be freely distributed under the conditions set out
+ * above, and must contain this copyright notice.
+ * ======================================================================
+ */
+
+/*
+ * This performs a linear (non-indexed) search for a trace in an SRF archive.
+ *
+ * It's not intended as a suitable production program or as a library of code
+ * to use, but as a test and benchmark statistic.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <io_lib/Read.h>
+#include <io_lib/misc.h>
+#include <io_lib/ztr.h>
+#include <io_lib/srf.h>
+#include <io_lib/hash_table.h>
+
+#define MAX_REGIONS   40
+
+/* regn chunk */
+typedef struct {
+    char coord;
+    char *region_names;
+    int nregions;
+    char *name[MAX_REGIONS];
+    char code[MAX_REGIONS];
+    int start[MAX_REGIONS];
+    int length[MAX_REGIONS];
+    int index[MAX_REGIONS];
+    FILE *file[MAX_REGIONS];
+    int count;
+} regn_t;
+
+/*
+ * Inline reverse a string
+ */
+static void reverse_string( char *s , int length ) {
+    char temp;
+    char *first = s;
+    char *last  = s + length - 1;
+
+    /* Reverse complement */
+    while ( last > first ) {
+	temp = *first;
+	*first++ = *last;
+	*last-- = temp;
+    }
+}
+
+/*
+ * Reverse complement a DNA string.
+ */
+static void reverse_complement( char *s , int length ) {
+    char temp;
+    char *first = s;
+    char *last  = s + length - 1;
+    static unsigned char cbase[256];
+    static int init = 0;
+
+    /* Initialise cbase[] array on first use */
+    if (!init) {
+	int i;
+	for (i = 0; i < 256; i++)
+	    cbase[i] = i;
+	cbase['A'] = 'T'; cbase['a'] = 't';
+	cbase['C'] = 'G'; cbase['c'] = 'g';
+	cbase['G'] = 'C'; cbase['g'] = 'c';
+	cbase['T'] = 'A'; cbase['t'] = 'a';
+
+	init = 1;
+    }
+
+    /* Reverse complement */
+    while ( last > first ) {
+	temp = *first;
+	*first++ = cbase[(unsigned char)*last];
+	*last-- = cbase[(unsigned char)temp];
+    }
+
+    if (last == first)
+    	*first = cbase[(unsigned char)*first];
+}
+
+static char qlookup[256];
+void init_qlookup(void) {
+    int i;
+    for (i = -128; i < 128; i++) {
+        qlookup[i+128] = '!' + (int)((10*log(1+pow(10, i/10.0))/log(10)+.499));
+    }
+}
+
+/*
+ * Parse the REGN chunk, add to regn HASH
+ *
+ * Returns corresponding HashItem * from regn Hash
+ */
+HashItem *parse_regn(ztr_t *z, ztr_chunk_t *chunk, HashTable *regn_hash) {
+    char key[1024];
+    char *name;
+    HashItem *hi;
+    regn_t *regn;
+    size_t l;
+    
+    uncompress_chunk(z, chunk);
+
+    /* the hash key is a combination of the region names and boundaries */
+    name = ztr_lookup_mdata_value(z, chunk, "NAME");
+    l = snprintf(key, sizeof(key), "names=%s", name);
+    if( chunk->dlength ){
+        int nbndy = (chunk->dlength-1)/4;
+        uint4 *bndy = (uint4 *)(chunk->data+1);
+        int ibndy;
+	for (ibndy=0; ibndy<nbndy; ibndy++) {
+            if( ibndy )
+                l += snprintf(key + l, sizeof(key) - l,
+			      ";%d", be_int4(bndy[ibndy]));
+            else
+                l += snprintf(key + l, sizeof(key) - l,
+			      " boundaries=%d", be_int4(bndy[ibndy]));
+        }
+    }
+
+    if (NULL == (hi = (HashTableSearch(regn_hash, key, strlen(key))))) {
+        int iregion, nregions = 0, index = 1;
+        char *coord;
+	char *cp1;
+        uint4 bndy[MAX_REGIONS];
+        int ibndy, nbndy = 0;
+        HashData hd;
+
+        if( NULL == (regn = (regn_t *)malloc(sizeof(regn_t)))) {
+	    return NULL;
+	}
+
+	coord = ztr_lookup_mdata_value(z, chunk, "COORD");
+	regn->coord = (NULL == coord ? 'B' : *coord );
+
+	regn->region_names = strdup(name);
+
+        cp1 = strtok (regn->region_names,";");
+        while(cp1) {
+            char *cp2;
+            if(NULL == (cp2 = strchr(cp1,':'))) {
+                fprintf(stderr, "Invalid region name/code pair %s\n", cp1);
+                return NULL;
+            }
+            *cp2++ = '\0';
+            regn->name[nregions] = cp1;
+            regn->code[nregions] = *cp2;
+            nregions++;
+            cp1 = strtok (NULL, ";");
+        }
+
+        regn->nregions = nregions;
+
+	if( chunk->dlength ) {
+            nbndy = (chunk->dlength-1)/4;
+            memcpy(bndy, chunk->data+1, chunk->dlength-1);
+	}
+
+        for( iregion=0, ibndy=0; iregion<nregions; iregion++) {
+            /* start = (start + length of previous region) or 0 if no previous region */
+            /* length = (next boundary - start of region) or -1 if no next boundary */
+            if( regn->code[iregion] == 'E' ){
+                /* not in BASE chunk, no boundary, set length = 0 */
+                regn->start[iregion] = (iregion ? (regn->start[iregion-1] + regn->length[iregion-1]) : 0);
+                regn->length[iregion] = 0;
+            } else {
+                if( ibndy > nbndy ){
+                    fprintf(stderr, "More name/code pairs than boundaries\n");
+                    return NULL;
+                }
+                regn->start[iregion] = (iregion ? (regn->start[iregion-1] + regn->length[iregion-1]) : 0);
+                regn->length[iregion] = (ibndy == nbndy ? -1 : (be_int4(bndy[ibndy])-regn->start[iregion]));
+                regn->index[iregion] = index;
+                ibndy++;
+                index++;
+            }
+        }
+
+        regn->count = 1;
+            
+	hd.p = regn;
+	if (NULL == (hi = HashTableAdd(regn_hash, key, strlen(key), hd, NULL))) {
+	    free(regn->region_names);
+	    free(regn);
+	    return NULL;
+	}
+    } else {
+	regn = (regn_t *)(hi->data.p);
+	regn->count++;
+    }
+
+    return hi;
+}
+
+/* ------------------------------------------------------------------------ */
+#define MAX_READ_LEN 10000
+void ztr2fastq(ztr_t *z, char *name, int calibrated, int sequential,
+               int split, char *root, int numeric, int append, int explicit,
+               HashTable *regn_hash, int *nfiles_open, char **filenames,
+	       FILE **files, int *reverse) {
+    int i, nc, seq_len, nfiles = *nfiles_open;
+    char buf[MAX_READ_LEN*2 + 512 + 6];
+    char *seq, *qual, *sdata, *qdata, *key;
+    ztr_chunk_t **chunks;
+    HashItem *hi;
+    regn_t *regn;
+    int logodds;
+
+    if ( sequential || split || explicit ) {
+        chunks = ztr_find_chunks(z, ZTR_TYPE_REGN, &nc);
+        if (nc != 1) {
+            fprintf(stderr, "Zero or greater than one REGN chunks found.\n");
+            if (chunks)
+                free(chunks);
+            return;
+        }
+        if( NULL == (hi = parse_regn(z, chunks[0], regn_hash)) ){
+            fprintf(stderr, "Invalid RGEN chunk\n");
+            if (chunks)
+                free(chunks);
+            return;
+        }
+        regn = (regn_t *)(hi->data.p);
+        if( regn->count == 1 ){
+            int iregion;
+            for (iregion=0; iregion<regn->nregions; iregion++) {
+                if( regn->code[iregion] == 'E' ) {
+                    /* not in BASE chunk, the name of region IS the sequence, set file = NULL */
+                    regn->file[iregion] = NULL;
+                } else if( split ){
+                    char filename[FILENAME_MAX];
+                    int ifile;
+                    if( numeric ){
+                        sprintf(filename, "%s_%d.fastq", root,
+				regn->index[iregion]);
+                    } else {
+                        sprintf(filename, "%s_%s.fastq", root,
+				regn->name[iregion]);
+                    }
+                    for (ifile=0; ifile<nfiles; ifile++) {
+                        if( 0 == strcmp(filename,filenames[ifile]) ){
+                            regn->file[iregion] = files[ifile];
+                            break;
+                        }
+                    }
+                    if( ifile == nfiles ){
+                        FILE *fp;
+                        if (nfiles == MAX_REGIONS) {
+                            fprintf(stderr, "Too many regions.\n");
+                            if (chunks)
+                                free(chunks);
+                            return;
+                        }
+                        printf("Opening file %s\n", filename);
+                        filenames[nfiles] = strdup(filename);
+                        if (NULL == (fp = fopen(filename, "wb+"))) {
+                            perror(filename);
+                            if (chunks)
+                                free(chunks);
+                            return;
+                        }
+                        files[nfiles++] = fp;
+                        regn->file[iregion] = fp;
+                    }
+                } else {
+                    regn->file[iregion] = stdout;
+                }
+            }
+        }
+
+        if (chunks)
+            free(chunks);
+    }
+
+    /* Extract the sequence only */
+    chunks = ztr_find_chunks(z, ZTR_TYPE_BASE, &nc);
+    if (nc != 1) {
+	fprintf(stderr, "Zero or greater than one BASE chunks found.\n");
+	if (chunks)
+	    free(chunks);
+	return;
+    }
+    uncompress_chunk(z, chunks[0]);
+    sdata = chunks[0]->data+1;
+    seq_len = chunks[0]->dlength-1;
+
+    /* Extract the quality */
+    free(chunks);
+    if (calibrated)
+	chunks = ztr_find_chunks(z, ZTR_TYPE_CNF1, &nc);
+    else
+	chunks = ztr_find_chunks(z, ZTR_TYPE_CNF4, &nc);
+
+    if (nc != 1) {
+	fprintf(stderr, "Zero or greater than one CNF chunks found.\n");
+	if (chunks)
+	    free(chunks);
+	return;
+    }
+    uncompress_chunk(z, chunks[0]);
+    qdata = chunks[0]->data+1;
+    key = ztr_lookup_mdata_value(z, chunks[0], "SCALE");
+    logodds = (key && 0 == strcmp(key, "LO")) ? 1 : 0;
+
+    /* Construct fastq entry */
+    if( sequential || split ){
+        int iregion;
+        for (iregion=0;
+	     iregion<regn->nregions && iregion<MAX_REGIONS;
+	     iregion++) {
+            char *cp = name;
+            int start, length;
+
+            if( regn->code[iregion] == 'E' ) {
+                /*
+		 * Not in BASE chunk, the sequence IS the name of the region
+		 * which may be pre-pended to the next region
+		 */
+                continue;
+            }
+            
+
+            start = regn->start[iregion];
+            length = (regn->length[iregion] == -1
+		      ? (seq_len-regn->start[iregion])
+		      : regn->length[iregion]);
+
+            seq = buf;
+            *seq++ = '@';
+            while (*cp)
+                *seq++ = *cp++;
+            if( append ){
+                int n = sprintf(seq,"/%d", regn->index[iregion]);
+                if( n < 0 ){
+                    fprintf(stderr, "Unable to add index to read name\n");
+                    if (chunks)
+                        free(chunks);
+                    return;
+                }
+                seq += n;
+            }
+            *seq++ = '\n';
+            qual = seq + length;
+
+            if( explicit && iregion && regn->code[iregion-1] == 'E' ) {
+                /*
+		 * previous region not in BASE chunk, the name of region
+		 * IS the sequence which is pre-pended to this region
+		 */
+                qual += strlen(regn->name[iregion-1]);
+            }
+
+            *qual++ = '\n';
+            *qual++ = '+';
+            *qual++ = '\n';
+            
+            if( explicit && iregion && regn->code[iregion-1] == 'E' ){
+                /*
+		 * previous region not in BASE chunk, the name of region
+		 * IS the sequence which is pre-pended to this region
+		 *
+		 * The idea of adding the sequence to the quality string
+		 * here seems very odd. However so far we have only seen
+		 * SOLiD files using explicit regions and in these their
+		 * own fastqs appear to have the DNA base prepended to
+		 * both the colour space sequence and quality strings.
+		 *
+		 * NB: we don't allow this to be reversed.
+		 */
+                strcpy(seq, regn->name[iregion-1]);
+                seq += strlen(regn->name[iregion-1]);
+                strcpy(qual, regn->name[iregion-1]);
+                qual += strlen(regn->name[iregion-1]);
+            }
+            
+	    /* If this is a region to be reversed, do so */
+	    if ( reverse[iregion] ) {
+		reverse_complement(sdata ,length);
+		reverse_string(qdata, length);
+            }
+
+            for (i = 0; i < length; i++) {
+                if (*sdata != '.') {
+                    *seq++ = *sdata++;
+                } else {
+                    *seq++ = 'N';
+                    sdata++;
+                }
+                *qual++ = logodds ? qlookup[*qdata++ + 128] : *qdata++ + '!';
+            }
+            *qual++ = '\n';
+
+            fwrite(buf, 1, qual - buf, regn->file[iregion]);
+        }
+    } else {
+        seq = buf;
+        *seq++ = '@';
+        while (*name)
+            *seq++ = *name++;
+        *seq++ = '\n';
+        qual = seq + seq_len;
+
+        if( explicit ){
+            int iregion;
+            for (iregion=0; iregion<regn->nregions; iregion++) {
+                if( regn->code[iregion] == 'E' ) {
+                    /*
+		     * region not in BASE chunk, the name of region IS
+		     * the sequence
+		     */
+                    qual += strlen(regn->name[iregion]);
+                }
+	    }
+        }
+
+        *qual++ = '\n';
+        *qual++ = '+';
+        *qual++ = '\n';
+
+        if( explicit ){
+            int iregion;
+            for (iregion=0; iregion<regn->nregions; iregion++) {
+                int start, length;
+                if( regn->code[iregion] == 'E' ){
+                    /*
+		     * region not in BASE chunk, the name of region IS
+		     * the sequence
+		     *
+		     * The idea of adding the sequence to the quality string
+		     * here seems very odd. However so far we have only seen
+		     * SOLiD files using explicit regions and in these their
+		     * own fastqs appear to have the DNA base prepended to
+		     * both the colour space sequence and quality strings.
+		     */
+                    strcpy(seq, regn->name[iregion]);
+                    seq += strlen(regn->name[iregion]);
+                    strcpy(qual, regn->name[iregion]);
+                    qual += strlen(regn->name[iregion]);
+                } else {
+                    start = regn->start[iregion];
+                    length = (regn->length[iregion] == -1
+			      ? (seq_len-regn->start[iregion])
+			      : regn->length[iregion]);
+
+                    /* If this is a region to be reversed, do so */
+                    if ( reverse[iregion] ) {
+			reverse_complement(sdata, length);
+			reverse_string(qdata, length);
+                    }
+
+                    for (i = 0; i < length; i++) {
+                        if (*sdata != '.') {
+                            *seq++ = *sdata++;
+                        } else {
+                            *seq++ = 'N';
+                            sdata++;
+                        }
+                        *qual++ = logodds
+			    ? qlookup[*qdata++ + 128]
+			    : *qdata++ + '!';
+                    }
+                }
+            }
+        } else {
+	    if ( reverse[0] ) {
+		reverse_complement(sdata, seq_len);
+		reverse_string(qdata, seq_len);
+	    }
+
+            for (i = 0; i < seq_len; i++) {
+                if (*sdata != '.') {
+                    *seq++ = *sdata++;
+                } else {
+                    *seq++ = 'N';
+                    sdata++;
+                }
+                *qual++ = logodds ? qlookup[*qdata++ + 128] : *qdata++ + '!';
+            }
+        }
+
+        *qual++ = '\n';
+
+        fwrite(buf, 1, qual - buf, stdout);
+    }
+    
+    *nfiles_open = nfiles;
+
+    free(chunks);
+}
+
+/* ------------------------------------------------------------------------ */
+void usage(void) {
+    fprintf(stderr, "Usage: srf2fastq [-c] [-C] [-s root] [-n] [-p] archive_name ...\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "       -c       Use calibrated quality values (CNF1)\n");
+    fprintf(stderr, "       -C       Ignore bad reads\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "       -s root  Split the fastq files, one for each region in the REGN chunk.\n");
+    fprintf(stderr, "                The files are named root_ + the name of the region.\n");
+    fprintf(stderr, "       -S       Sequentially display regions rather than append them into\n");
+    fprintf(stderr, "                one long read. (conflicts with -s)\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "       -n       Ignore REGN names: use region index.\n");
+    fprintf(stderr, "                i.e. root_1, root_2 etc.\n");
+    fprintf(stderr, "       -a       Append region index to name\n");
+    fprintf(stderr, "                i.e. name/1, name/2 etc.\n");
+    fprintf(stderr, "       -e       Include explicit sequence: the names of the regions of type 'E'\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "       -r 1,2.. In a comma seperated list, specify which regions to reverse,\n");
+    fprintf(stderr, "                counting from 1. This will reverse complement the read and\n");
+    fprintf(stderr, "                reverse the quality scores. (requires -s or -S)\n");
+    exit(1);
+}
+
+int main(int argc, char **argv) {
+    int calibrated = 0;
+    int mask = 0, i;
+    int sequential = 0;
+    int split = 0;
+    int numeric = 0;
+    int append = 0;
+    int explicit = 0;
+    char root[FILENAME_MAX];
+    int nfiles_open = 0;
+    char *filenames[MAX_REGIONS];
+    FILE *files[MAX_REGIONS];
+    int reverse[MAX_REGIONS], reverse_set = 0;
+
+    memset(reverse, 0, MAX_REGIONS * sizeof(int));
+
+    /* Parse args */
+    for (i = 1; i < argc && argv[i][0] == '-'; i++) {
+	if (!strcmp(argv[i], "-")) {
+	    break;
+	} else if (!strcmp(argv[i], "-C")) {
+	    mask = SRF_READ_FLAG_BAD_MASK;
+	} else if (!strcmp(argv[i], "-c")) {
+	    calibrated = 1;
+	} else if (!strcmp(argv[i], "-s")) {
+            split = 1;
+            strcpy(root, argv[++i]);
+	} else if (!strcmp(argv[i], "-S")) {
+            sequential = 1;
+	} else if (!strcmp(argv[i], "-n")) {
+            numeric = 1;
+	} else if (!strcmp(argv[i], "-a")) {
+            append = 1;
+	} else if (!strcmp(argv[i], "-e")) {
+            explicit = 1;
+        } else if (!strcmp(argv[i], "-r")) {
+	    char *cp, *cpend;
+
+            /* Figure out which ends to reverse */
+	    if (++i == argc)
+		usage();
+
+	    cp = argv[i];
+	    do {
+		long l = (int)strtol(cp, &cpend, 10);
+		if (cpend - cp && l >= 1 && l <= MAX_REGIONS)
+		    reverse[l-1] = 1;
+		cp = cpend+1;
+	    } while (*cpend);
+
+	    reverse_set = 1;
+	} else {
+	    usage();
+	}
+    }    
+
+    if (i == argc) {
+	usage();
+    }
+
+    if ( sequential && split ) {
+        fprintf(stderr, "ERROR: Parameters -s and -S conflict!\n");
+        usage();
+    }
+
+    if ( reverse_set && ! (sequential || split) ) {
+	fprintf(stderr, "ERROR: The -r parameter is only supported when "
+		"spliting sequences by region.\n");
+	usage();
+    }
+
+    read_sections(READ_BASES);
+    init_qlookup();
+
+#ifdef _WIN32
+    _setmode(_fileno(stdout), _O_BINARY);
+#endif
+
+    for (; i < argc; i++) {
+	char *ar_name;
+	srf_t *srf;
+        HashTable *regn_hash;
+	char name[512];
+	ztr_t *ztr;
+
+	ar_name = argv[i];
+
+	if (NULL == (srf = srf_open(ar_name, "r"))) {
+	    perror(ar_name);
+	    return 4;
+	}
+
+        if (NULL == (regn_hash = HashTableCreate(0,HASH_DYNAMIC_SIZE|HASH_FUNC_JENKINS3))) {
+	    return 1;
+        }
+    
+	while (NULL != (ztr = srf_next_ztr(srf, name, mask))) {
+            ztr2fastq(ztr, name, calibrated, sequential, split, root, numeric,
+		      append, explicit, regn_hash, &nfiles_open, filenames,
+		      files, reverse);
+	    delete_ztr(ztr);
+	}
+
+	srf_destroy(srf, 1);
+    }
+
+    return 0;
+}
author	dawe
date	Tue, 07 Jun 2011 17:48:05 -0400
parents
children