Mercurial > repos > dawe > srf2fastq
diff srf2fastq/io_lib-1.12.2/progs/srf2fastq.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srf2fastq/io_lib-1.12.2/progs/srf2fastq.c Tue Jun 07 17:48:05 2011 -0400 @@ -0,0 +1,649 @@ +/* + * ====================================================================== + * This software has been created by Genome Research Limited (GRL). + * + * GRL hereby grants permission to use, copy, modify and distribute + * this software and its documentation for non-commercial purposes + * without fee at the user's own risk on the basis set out below. + * + * GRL neither undertakes nor accepts any duty whether contractual or + * otherwise in connection with the software, its use or the use of + * any derivative, and makes no representations or warranties, express + * or implied, concerning the software, its suitability, fitness for + * a particular purpose or non-infringement. + * + * In no event shall the authors of the software or GRL be responsible + * or liable for any loss or damage whatsoever arising in any way + * directly or indirectly out of the use of this software or its + * derivatives, even if advised of the possibility of such damage. + * + * Our software can be freely distributed under the conditions set out + * above, and must contain this copyright notice. + * ====================================================================== + */ + +/* + * This performs a linear (non-indexed) search for a trace in an SRF archive. + * + * It's not intended as a suitable production program or as a library of code + * to use, but as a test and benchmark statistic. + */ + +#include <stdio.h> +#include <math.h> +#include <string.h> +#include <fcntl.h> + +#include <io_lib/Read.h> +#include <io_lib/misc.h> +#include <io_lib/ztr.h> +#include <io_lib/srf.h> +#include <io_lib/hash_table.h> + +#define MAX_REGIONS 40 + +/* regn chunk */ +typedef struct { + char coord; + char *region_names; + int nregions; + char *name[MAX_REGIONS]; + char code[MAX_REGIONS]; + int start[MAX_REGIONS]; + int length[MAX_REGIONS]; + int index[MAX_REGIONS]; + FILE *file[MAX_REGIONS]; + int count; +} regn_t; + +/* + * Inline reverse a string + */ +static void reverse_string( char *s , int length ) { + char temp; + char *first = s; + char *last = s + length - 1; + + /* Reverse complement */ + while ( last > first ) { + temp = *first; + *first++ = *last; + *last-- = temp; + } +} + +/* + * Reverse complement a DNA string. + */ +static void reverse_complement( char *s , int length ) { + char temp; + char *first = s; + char *last = s + length - 1; + static unsigned char cbase[256]; + static int init = 0; + + /* Initialise cbase[] array on first use */ + if (!init) { + int i; + for (i = 0; i < 256; i++) + cbase[i] = i; + cbase['A'] = 'T'; cbase['a'] = 't'; + cbase['C'] = 'G'; cbase['c'] = 'g'; + cbase['G'] = 'C'; cbase['g'] = 'c'; + cbase['T'] = 'A'; cbase['t'] = 'a'; + + init = 1; + } + + /* Reverse complement */ + while ( last > first ) { + temp = *first; + *first++ = cbase[(unsigned char)*last]; + *last-- = cbase[(unsigned char)temp]; + } + + if (last == first) + *first = cbase[(unsigned char)*first]; +} + +static char qlookup[256]; +void init_qlookup(void) { + int i; + for (i = -128; i < 128; i++) { + qlookup[i+128] = '!' + (int)((10*log(1+pow(10, i/10.0))/log(10)+.499)); + } +} + +/* + * Parse the REGN chunk, add to regn HASH + * + * Returns corresponding HashItem * from regn Hash + */ +HashItem *parse_regn(ztr_t *z, ztr_chunk_t *chunk, HashTable *regn_hash) { + char key[1024]; + char *name; + HashItem *hi; + regn_t *regn; + size_t l; + + uncompress_chunk(z, chunk); + + /* the hash key is a combination of the region names and boundaries */ + name = ztr_lookup_mdata_value(z, chunk, "NAME"); + l = snprintf(key, sizeof(key), "names=%s", name); + if( chunk->dlength ){ + int nbndy = (chunk->dlength-1)/4; + uint4 *bndy = (uint4 *)(chunk->data+1); + int ibndy; + for (ibndy=0; ibndy<nbndy; ibndy++) { + if( ibndy ) + l += snprintf(key + l, sizeof(key) - l, + ";%d", be_int4(bndy[ibndy])); + else + l += snprintf(key + l, sizeof(key) - l, + " boundaries=%d", be_int4(bndy[ibndy])); + } + } + + if (NULL == (hi = (HashTableSearch(regn_hash, key, strlen(key))))) { + int iregion, nregions = 0, index = 1; + char *coord; + char *cp1; + uint4 bndy[MAX_REGIONS]; + int ibndy, nbndy = 0; + HashData hd; + + if( NULL == (regn = (regn_t *)malloc(sizeof(regn_t)))) { + return NULL; + } + + coord = ztr_lookup_mdata_value(z, chunk, "COORD"); + regn->coord = (NULL == coord ? 'B' : *coord ); + + regn->region_names = strdup(name); + + cp1 = strtok (regn->region_names,";"); + while(cp1) { + char *cp2; + if(NULL == (cp2 = strchr(cp1,':'))) { + fprintf(stderr, "Invalid region name/code pair %s\n", cp1); + return NULL; + } + *cp2++ = '\0'; + regn->name[nregions] = cp1; + regn->code[nregions] = *cp2; + nregions++; + cp1 = strtok (NULL, ";"); + } + + regn->nregions = nregions; + + if( chunk->dlength ) { + nbndy = (chunk->dlength-1)/4; + memcpy(bndy, chunk->data+1, chunk->dlength-1); + } + + for( iregion=0, ibndy=0; iregion<nregions; iregion++) { + /* start = (start + length of previous region) or 0 if no previous region */ + /* length = (next boundary - start of region) or -1 if no next boundary */ + if( regn->code[iregion] == 'E' ){ + /* not in BASE chunk, no boundary, set length = 0 */ + regn->start[iregion] = (iregion ? (regn->start[iregion-1] + regn->length[iregion-1]) : 0); + regn->length[iregion] = 0; + } else { + if( ibndy > nbndy ){ + fprintf(stderr, "More name/code pairs than boundaries\n"); + return NULL; + } + regn->start[iregion] = (iregion ? (regn->start[iregion-1] + regn->length[iregion-1]) : 0); + regn->length[iregion] = (ibndy == nbndy ? -1 : (be_int4(bndy[ibndy])-regn->start[iregion])); + regn->index[iregion] = index; + ibndy++; + index++; + } + } + + regn->count = 1; + + hd.p = regn; + if (NULL == (hi = HashTableAdd(regn_hash, key, strlen(key), hd, NULL))) { + free(regn->region_names); + free(regn); + return NULL; + } + } else { + regn = (regn_t *)(hi->data.p); + regn->count++; + } + + return hi; +} + +/* ------------------------------------------------------------------------ */ +#define MAX_READ_LEN 10000 +void ztr2fastq(ztr_t *z, char *name, int calibrated, int sequential, + int split, char *root, int numeric, int append, int explicit, + HashTable *regn_hash, int *nfiles_open, char **filenames, + FILE **files, int *reverse) { + int i, nc, seq_len, nfiles = *nfiles_open; + char buf[MAX_READ_LEN*2 + 512 + 6]; + char *seq, *qual, *sdata, *qdata, *key; + ztr_chunk_t **chunks; + HashItem *hi; + regn_t *regn; + int logodds; + + if ( sequential || split || explicit ) { + chunks = ztr_find_chunks(z, ZTR_TYPE_REGN, &nc); + if (nc != 1) { + fprintf(stderr, "Zero or greater than one REGN chunks found.\n"); + if (chunks) + free(chunks); + return; + } + if( NULL == (hi = parse_regn(z, chunks[0], regn_hash)) ){ + fprintf(stderr, "Invalid RGEN chunk\n"); + if (chunks) + free(chunks); + return; + } + regn = (regn_t *)(hi->data.p); + if( regn->count == 1 ){ + int iregion; + for (iregion=0; iregion<regn->nregions; iregion++) { + if( regn->code[iregion] == 'E' ) { + /* not in BASE chunk, the name of region IS the sequence, set file = NULL */ + regn->file[iregion] = NULL; + } else if( split ){ + char filename[FILENAME_MAX]; + int ifile; + if( numeric ){ + sprintf(filename, "%s_%d.fastq", root, + regn->index[iregion]); + } else { + sprintf(filename, "%s_%s.fastq", root, + regn->name[iregion]); + } + for (ifile=0; ifile<nfiles; ifile++) { + if( 0 == strcmp(filename,filenames[ifile]) ){ + regn->file[iregion] = files[ifile]; + break; + } + } + if( ifile == nfiles ){ + FILE *fp; + if (nfiles == MAX_REGIONS) { + fprintf(stderr, "Too many regions.\n"); + if (chunks) + free(chunks); + return; + } + printf("Opening file %s\n", filename); + filenames[nfiles] = strdup(filename); + if (NULL == (fp = fopen(filename, "wb+"))) { + perror(filename); + if (chunks) + free(chunks); + return; + } + files[nfiles++] = fp; + regn->file[iregion] = fp; + } + } else { + regn->file[iregion] = stdout; + } + } + } + + if (chunks) + free(chunks); + } + + /* Extract the sequence only */ + chunks = ztr_find_chunks(z, ZTR_TYPE_BASE, &nc); + if (nc != 1) { + fprintf(stderr, "Zero or greater than one BASE chunks found.\n"); + if (chunks) + free(chunks); + return; + } + uncompress_chunk(z, chunks[0]); + sdata = chunks[0]->data+1; + seq_len = chunks[0]->dlength-1; + + /* Extract the quality */ + free(chunks); + if (calibrated) + chunks = ztr_find_chunks(z, ZTR_TYPE_CNF1, &nc); + else + chunks = ztr_find_chunks(z, ZTR_TYPE_CNF4, &nc); + + if (nc != 1) { + fprintf(stderr, "Zero or greater than one CNF chunks found.\n"); + if (chunks) + free(chunks); + return; + } + uncompress_chunk(z, chunks[0]); + qdata = chunks[0]->data+1; + key = ztr_lookup_mdata_value(z, chunks[0], "SCALE"); + logodds = (key && 0 == strcmp(key, "LO")) ? 1 : 0; + + /* Construct fastq entry */ + if( sequential || split ){ + int iregion; + for (iregion=0; + iregion<regn->nregions && iregion<MAX_REGIONS; + iregion++) { + char *cp = name; + int start, length; + + if( regn->code[iregion] == 'E' ) { + /* + * Not in BASE chunk, the sequence IS the name of the region + * which may be pre-pended to the next region + */ + continue; + } + + + start = regn->start[iregion]; + length = (regn->length[iregion] == -1 + ? (seq_len-regn->start[iregion]) + : regn->length[iregion]); + + seq = buf; + *seq++ = '@'; + while (*cp) + *seq++ = *cp++; + if( append ){ + int n = sprintf(seq,"/%d", regn->index[iregion]); + if( n < 0 ){ + fprintf(stderr, "Unable to add index to read name\n"); + if (chunks) + free(chunks); + return; + } + seq += n; + } + *seq++ = '\n'; + qual = seq + length; + + if( explicit && iregion && regn->code[iregion-1] == 'E' ) { + /* + * previous region not in BASE chunk, the name of region + * IS the sequence which is pre-pended to this region + */ + qual += strlen(regn->name[iregion-1]); + } + + *qual++ = '\n'; + *qual++ = '+'; + *qual++ = '\n'; + + if( explicit && iregion && regn->code[iregion-1] == 'E' ){ + /* + * previous region not in BASE chunk, the name of region + * IS the sequence which is pre-pended to this region + * + * The idea of adding the sequence to the quality string + * here seems very odd. However so far we have only seen + * SOLiD files using explicit regions and in these their + * own fastqs appear to have the DNA base prepended to + * both the colour space sequence and quality strings. + * + * NB: we don't allow this to be reversed. + */ + strcpy(seq, regn->name[iregion-1]); + seq += strlen(regn->name[iregion-1]); + strcpy(qual, regn->name[iregion-1]); + qual += strlen(regn->name[iregion-1]); + } + + /* If this is a region to be reversed, do so */ + if ( reverse[iregion] ) { + reverse_complement(sdata ,length); + reverse_string(qdata, length); + } + + for (i = 0; i < length; i++) { + if (*sdata != '.') { + *seq++ = *sdata++; + } else { + *seq++ = 'N'; + sdata++; + } + *qual++ = logodds ? qlookup[*qdata++ + 128] : *qdata++ + '!'; + } + *qual++ = '\n'; + + fwrite(buf, 1, qual - buf, regn->file[iregion]); + } + } else { + seq = buf; + *seq++ = '@'; + while (*name) + *seq++ = *name++; + *seq++ = '\n'; + qual = seq + seq_len; + + if( explicit ){ + int iregion; + for (iregion=0; iregion<regn->nregions; iregion++) { + if( regn->code[iregion] == 'E' ) { + /* + * region not in BASE chunk, the name of region IS + * the sequence + */ + qual += strlen(regn->name[iregion]); + } + } + } + + *qual++ = '\n'; + *qual++ = '+'; + *qual++ = '\n'; + + if( explicit ){ + int iregion; + for (iregion=0; iregion<regn->nregions; iregion++) { + int start, length; + if( regn->code[iregion] == 'E' ){ + /* + * region not in BASE chunk, the name of region IS + * the sequence + * + * The idea of adding the sequence to the quality string + * here seems very odd. However so far we have only seen + * SOLiD files using explicit regions and in these their + * own fastqs appear to have the DNA base prepended to + * both the colour space sequence and quality strings. + */ + strcpy(seq, regn->name[iregion]); + seq += strlen(regn->name[iregion]); + strcpy(qual, regn->name[iregion]); + qual += strlen(regn->name[iregion]); + } else { + start = regn->start[iregion]; + length = (regn->length[iregion] == -1 + ? (seq_len-regn->start[iregion]) + : regn->length[iregion]); + + /* If this is a region to be reversed, do so */ + if ( reverse[iregion] ) { + reverse_complement(sdata, length); + reverse_string(qdata, length); + } + + for (i = 0; i < length; i++) { + if (*sdata != '.') { + *seq++ = *sdata++; + } else { + *seq++ = 'N'; + sdata++; + } + *qual++ = logodds + ? qlookup[*qdata++ + 128] + : *qdata++ + '!'; + } + } + } + } else { + if ( reverse[0] ) { + reverse_complement(sdata, seq_len); + reverse_string(qdata, seq_len); + } + + for (i = 0; i < seq_len; i++) { + if (*sdata != '.') { + *seq++ = *sdata++; + } else { + *seq++ = 'N'; + sdata++; + } + *qual++ = logodds ? qlookup[*qdata++ + 128] : *qdata++ + '!'; + } + } + + *qual++ = '\n'; + + fwrite(buf, 1, qual - buf, stdout); + } + + *nfiles_open = nfiles; + + free(chunks); +} + +/* ------------------------------------------------------------------------ */ +void usage(void) { + fprintf(stderr, "Usage: srf2fastq [-c] [-C] [-s root] [-n] [-p] archive_name ...\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -c Use calibrated quality values (CNF1)\n"); + fprintf(stderr, " -C Ignore bad reads\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -s root Split the fastq files, one for each region in the REGN chunk.\n"); + fprintf(stderr, " The files are named root_ + the name of the region.\n"); + fprintf(stderr, " -S Sequentially display regions rather than append them into\n"); + fprintf(stderr, " one long read. (conflicts with -s)\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -n Ignore REGN names: use region index.\n"); + fprintf(stderr, " i.e. root_1, root_2 etc.\n"); + fprintf(stderr, " -a Append region index to name\n"); + fprintf(stderr, " i.e. name/1, name/2 etc.\n"); + fprintf(stderr, " -e Include explicit sequence: the names of the regions of type 'E'\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -r 1,2.. In a comma seperated list, specify which regions to reverse,\n"); + fprintf(stderr, " counting from 1. This will reverse complement the read and\n"); + fprintf(stderr, " reverse the quality scores. (requires -s or -S)\n"); + exit(1); +} + +int main(int argc, char **argv) { + int calibrated = 0; + int mask = 0, i; + int sequential = 0; + int split = 0; + int numeric = 0; + int append = 0; + int explicit = 0; + char root[FILENAME_MAX]; + int nfiles_open = 0; + char *filenames[MAX_REGIONS]; + FILE *files[MAX_REGIONS]; + int reverse[MAX_REGIONS], reverse_set = 0; + + memset(reverse, 0, MAX_REGIONS * sizeof(int)); + + /* Parse args */ + for (i = 1; i < argc && argv[i][0] == '-'; i++) { + if (!strcmp(argv[i], "-")) { + break; + } else if (!strcmp(argv[i], "-C")) { + mask = SRF_READ_FLAG_BAD_MASK; + } else if (!strcmp(argv[i], "-c")) { + calibrated = 1; + } else if (!strcmp(argv[i], "-s")) { + split = 1; + strcpy(root, argv[++i]); + } else if (!strcmp(argv[i], "-S")) { + sequential = 1; + } else if (!strcmp(argv[i], "-n")) { + numeric = 1; + } else if (!strcmp(argv[i], "-a")) { + append = 1; + } else if (!strcmp(argv[i], "-e")) { + explicit = 1; + } else if (!strcmp(argv[i], "-r")) { + char *cp, *cpend; + + /* Figure out which ends to reverse */ + if (++i == argc) + usage(); + + cp = argv[i]; + do { + long l = (int)strtol(cp, &cpend, 10); + if (cpend - cp && l >= 1 && l <= MAX_REGIONS) + reverse[l-1] = 1; + cp = cpend+1; + } while (*cpend); + + reverse_set = 1; + } else { + usage(); + } + } + + if (i == argc) { + usage(); + } + + if ( sequential && split ) { + fprintf(stderr, "ERROR: Parameters -s and -S conflict!\n"); + usage(); + } + + if ( reverse_set && ! (sequential || split) ) { + fprintf(stderr, "ERROR: The -r parameter is only supported when " + "spliting sequences by region.\n"); + usage(); + } + + read_sections(READ_BASES); + init_qlookup(); + +#ifdef _WIN32 + _setmode(_fileno(stdout), _O_BINARY); +#endif + + for (; i < argc; i++) { + char *ar_name; + srf_t *srf; + HashTable *regn_hash; + char name[512]; + ztr_t *ztr; + + ar_name = argv[i]; + + if (NULL == (srf = srf_open(ar_name, "r"))) { + perror(ar_name); + return 4; + } + + if (NULL == (regn_hash = HashTableCreate(0,HASH_DYNAMIC_SIZE|HASH_FUNC_JENKINS3))) { + return 1; + } + + while (NULL != (ztr = srf_next_ztr(srf, name, mask))) { + ztr2fastq(ztr, name, calibrated, sequential, split, root, numeric, + append, explicit, regn_hash, &nfiles_open, filenames, + files, reverse); + delete_ztr(ztr); + } + + srf_destroy(srf, 1); + } + + return 0; +}