diff srf2fastq/io_lib-1.12.2/io_lib/seqIOALF.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srf2fastq/io_lib-1.12.2/io_lib/seqIOALF.c	Tue Jun 07 17:48:05 2011 -0400
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) Medical Research Council 1994. All rights reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * this copyright and notice appears in all copies.
+ *
+ * This file was written by James Bonfield, Simon Dear, Rodger Staden,
+ * as part of the Staden Package at the MRC Laboratory of Molecular
+ * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
+ *
+ * MRC disclaims all warranties with regard to this software.
+ */
+
+/* 
+ * Title:       seqIOALF
+ *
+ * File: 	 seqIOALF.c
+ * Purpose:	 IO of ALF sequences
+ * Last update:  9th September 1994
+ */
+
+/*
+ * Change Log :- 
+ * 14.01.91 SD
+ * when complimenting the sequence with an odd number of bases,
+ * the middle base position was not adjusted.
+ * 15.01.91 SD  Put StLouis stuff on compilation flag
+ * 15.01.91 SD  New include file (opp.h)
+ * 02.08.91 SD  Changes the mapping of uncertainty codes so that we
+ * now only generate A C G T and -
+ * Previously... bug in interpreting ALF integer fields.
+ * We now treat them as unsigned.
+ * 17.09.91 LFW changed STLOUIS compilation flag to SAVE_EDITS
+ * and AUTO_CLIP
+ * 25.10.91 SD  Machine independant I/O...removed BIGENDIAN flag
+ * 25.11.91 SD There was a hard limit (of 1024) for allocation of
+ * space for number of bases, yet program would 
+ * read in more if there were any, causing nasties to happen.
+ * 
+ * 11.11.92 LFW added section to actually check that the trace it
+ * is trying to open is an ALF file using traceType sub
+ * 
+ * 10.11.92 SD  SCF comments now stored in seq data structure
+ * 09.09.94 JKB Update to use Read instead of Seq library.
+ * 04.03.98 JKB Look for "Raw data" when "Processed data" is not found.
+ */
+
+/* RMD I made substantial changes to this file 12/28/90 so as to
+ * read sequence data more freely (necessary when reading data from
+ * multiple trace files).
+ * The affected area is indicated by comments starting RMD, like
+ * this one.
+ */
+
+/* This file was adapted by LFW from seqIOABI.c.
+ * The ALF results file is a concatenation of many files with an
+ * index structure at the beginning, consisting of a 512 byte
+ * block that we ignore, followed by 128 byte blocks describing
+ * each file.  All files, including the header region, are rounded 
+ * up to a multiple of 512 bytes long.  
+ * The getIndexEntry routines identify the 128 byte index component
+ * of interest by matching 4 chars of its ASCII label, then extract
+ * the field of choice from that entry.
+ * 
+ * Note that the SUN and PC are of opposite endian-ness, so that
+ * we have to provide special routines to read words and longwords
+ * from the results file.  Luckily the floating point numbers are
+ * written out in ASCII.
+ */
+
+
+/* ---- Imports ---- */
+
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "io_lib/stdio_hack.h"
+
+#include "io_lib/Read.h"
+#include "io_lib/mach-io.h"
+#include "io_lib/xalloc.h"
+
+/* ---- Constants ---- */
+
+#define BasesPerLine 50 /* For output formatting */
+
+#define IndexEntryLength ((off_t)128)
+
+
+/*
+ * Here are some labels we will be looking for, four chars packed
+ * into a long word.
+ */
+#define EntryLabel        ((uint_4) ((((('A'<<8)+'L')<<8)+'F')<<8)+' ')
+#define BaseEntryLabel    ((uint_4) ((((('S'<<8)+'e')<<8)+'q')<<8)+'u')
+#define DataEntryLabel    ((uint_4) ((((('P'<<8)+'r')<<8)+'o')<<8)+'c')
+#define RawDataEntryLabel ((uint_4) ((((('R'<<8)+'a')<<8)+'w')<<8)+' ')
+
+/* RMD make enough space for bases - hard limit */
+#define BASELIMIT 4096
+
+
+/* ---- Internal functions ---- */
+
+/*
+ * From the ALF results file connected to `fp' whose index starts
+ * at byte offset `indexO', return in `val' the `lw'th long word
+ * from the entry labelled `label'.
+ * The result is 0 for failure, 1 for success.
+ */
+static int getIndexEntryLW(FILE *fp, off_t indexO,
+			   uint_4 label, int lw,
+			   uint_4 *val) {
+    off_t entryNum=-1;
+    int i;
+    uint_4 entryLabel;
+    
+    do {
+	entryNum++;
+	if (fseek(fp, indexO+(entryNum*IndexEntryLength), 0) != 0) 
+	    return 0;
+	    
+	if (!be_read_int_4(fp, &entryLabel))
+	    return 0;
+    } while (!(entryLabel == label));
+    
+    for(i=2; i<lw; i++)
+	if (!be_read_int_4(fp, val))
+	    return 0;
+    
+    
+    /* when i = lw read in the 4 bytes backwards */
+    if (!le_read_int_4(fp,val))
+	return 0;
+    
+    return 1;
+}
+
+/*
+ * From the ALF results file connected to `fp' whose index starts
+ * at byte offset `indexO', return in `val' the `lw'th  word (int2)
+ * from the entry labelled `label'.
+ * The result is 0 for failure, 1 for success.
+ */
+static int getIndexEntryW(FILE *fp, off_t indexO,
+			  uint_4 label, int lw,
+			  uint_2 *val) {
+    off_t entryNum=-1;
+    int i;
+    uint_4 entryLabel;
+    uint_4 jval;
+    
+    do {
+	entryNum++;
+	if (fseek(fp, indexO+(entryNum*IndexEntryLength), 0) != 0)
+	    return 0;
+
+	if (!be_read_int_4(fp, &entryLabel))
+	    return 0;
+	} while (!(entryLabel == label));
+    
+    
+    for(i=2; i<lw; i++)
+	if (!be_read_int_4(fp, &jval))
+	    return 0;
+
+    if (!le_read_int_2(fp, val))
+	return 0;
+    
+    return 1;
+}
+
+
+/* ---- Exports ---- */
+
+
+/*
+ * Read the ALF format sequence from FILE *fp into a Read structure.
+ * All printing characters (as defined by ANSII C `isprint')
+ * are accepted, but `N's are translated to `-'s. In this respect we
+ * are adhering (more or less) to the CSET_DEFAULT uncertainty code set.
+ * 
+ * Returns:
+ *   Read *	- Success, the Read structure read.
+ *   NULLRead	- Failure.
+ */
+Read *fread_alf(FILE *fp) {
+    Read *read = NULLRead;
+    int i;
+    int numPoints;
+    int sections = read_sections(0);
+    
+    uint_4 data_size;
+    uint_4 dataO;
+    uint_4 header_size=396; /* size of the header of the processed data
+			       section */
+    uint_2 actBaseDataSize; /* actual number of bytes of data of information
+			       containing the base and basePos information */
+    int num_points;         /* keeps track of the actual number of points,
+			       rather than the early guess of numPoints */
+
+    off_t indexO;           /* File offset where the index is */
+    uint_4 baseO;           /* File offset where the bases are stored */
+    
+    
+    /*
+     * RMD lots of changes below here until end of data reading section
+     * Some are cosmetic.
+     * getIndexEntry calls in front of where they were needed, and made
+     * There is a substantive change to the inner loop of the sequence
+     * reading section.  This now uses fscanf - much less rigid than the
+     * previous scheme.  Note that it reads bp as a float.  This is because
+     * it is a float in multiple trace data files! (bizarre Pharmacia
+     * programming!).
+     */
+    
+    
+    /*************************************************************
+     * Read the various file offsets
+     *************************************************************/
+
+    /* indexO is the offset of the index.
+     * Or I could look for the first label, starting 'ALF'
+     * if I used 512 then none of the entries are on long 
+     * word boundaries
+     */
+    indexO = 522;
+    
+    /* offset in file of first base of sequence */
+    if (! (getIndexEntryLW(fp,indexO,BaseEntryLabel,12,&baseO)) )
+	goto bail_out;
+    
+    /* actual size of region containing this data */
+    if (! (getIndexEntryW(fp,indexO,BaseEntryLabel,10,&actBaseDataSize)) )
+	goto bail_out;
+    
+    /* Look for Processed data first. If we fail to find it, then look for
+     * the Raw data (same format).
+     */
+
+    /* offset in file to start of processed data segment - there 
+     * is then a header of size header_size (currently 396)
+     */
+    if (! (getIndexEntryLW(fp,indexO,DataEntryLabel,12,&dataO)) ) {
+	if (! (getIndexEntryLW(fp,indexO,RawDataEntryLabel,12,&dataO)) )
+	    goto bail_out;
+
+	/* actual size of region containing this data */
+	if (! (getIndexEntryLW(fp,indexO,RawDataEntryLabel,10,&data_size)) )
+	    goto bail_out;
+    } else {
+	/* actual size of region containing this data */
+	if (! (getIndexEntryLW(fp,indexO,DataEntryLabel,10,&data_size)) )
+	    goto bail_out;
+    }
+    
+    /* Because each trace value is stored in a 2 byte
+     * integer, thus to store A C G T information
+     * it takes 8 bytes.  So subtract off the header and
+     * divide by 8
+     */
+    numPoints = (int)((data_size - header_size)/ 8); 
+    
+    /* Allocate the sequence */
+    if (NULLRead == (read = read_allocate(numPoints, BASELIMIT)))
+	goto bail_out;
+    
+    /*************************************************************
+     * Read the bases information
+     *************************************************************/
+    if (sections & READ_BASES) {
+	/* new locals introduced by LFW and/or RMD for the ALF */
+	int numBases;	/* number of nucleotides read in */
+	float bp;
+	char ch;
+
+	if (!(fseek(fp, (off_t)baseO, 0) == 0))
+	    goto bail_out;
+	
+	for (numBases = 0; (unsigned)ftell(fp) < baseO+(unsigned short)actBaseDataSize
+	                   && numBases<BASELIMIT;) {
+	    char line[200];
+
+	    fgets(line, (int)sizeof(line), fp);
+	    sscanf(line, "%c %*d %f", &ch, &bp);
+	    
+	    /* we convert ch to Staden format here */
+	    switch (ch) {
+	    case 'A':
+	    case 'C':
+	    case 'G':
+	    case 'T':
+		break;
+	    default:
+		ch = '-';
+/*
+		if (isupper(ch))
+		    ch = '-';
+		else
+		    ch = '\0';
+*/
+	    }
+	    
+	    if (ch) {
+		read->base[numBases]    = ch;
+		read->prob_A[numBases]	= 0;
+		read->prob_C[numBases]	= 0;
+		read->prob_G[numBases]	= 0;
+		read->prob_T[numBases]	= 0;
+		read->basePos[numBases] = bp;
+		++numBases;
+	    }
+	}
+	read->base[numBases] = 0;
+	
+	read->NBases  = numBases;
+    }
+    
+    /*************************************************************
+     * Read the trace information
+     *************************************************************/
+    
+    if (sections & READ_SAMPLES) {
+	
+	/*
+	 * Traces are stored as 2 byte integers in records in the order of
+	 * A C G T A C G T ...
+	 */
+	
+	if (fseek(fp, (off_t)(dataO+header_size), 0) != 0) 
+	    goto bail_out;
+	
+	num_points = 0;
+	
+	for (i=0; i < read->NPoints; i++) {
+	    if (!le_read_int_2(fp, &(read->traceA[i])))
+		goto bail_out;
+	    if (read->maxTraceVal < read->traceA[i])
+		read->maxTraceVal = read->traceA[i];
+	    
+	    if (!le_read_int_2(fp, &(read->traceC[i])))
+		goto bail_out;
+	    if (read->maxTraceVal < read->traceC[i])
+		read->maxTraceVal = read->traceC[i];
+	    
+	    if (!le_read_int_2(fp, &(read->traceG[i])))
+		goto bail_out;
+	    if (read->maxTraceVal < read->traceG[i])
+		read->maxTraceVal = read->traceG[i];
+	    
+	    if (!le_read_int_2(fp, &(read->traceT[i])))
+		goto bail_out;
+	    if (read->maxTraceVal < read->traceT[i])
+		read->maxTraceVal = read->traceT[i];
+	    
+	    if (read->traceA[i]==0 && read->traceT[i]==0 &&
+		read->traceC[i]==0 && read->traceG[i]==0 &&
+		i > (numPoints-64))
+		break;
+	    
+	    num_points++;
+	}
+    }
+    
+    /* SUCCESS */
+
+    read->format = TT_ALF;
+    return(read);
+
+    /* FAILURE */
+ bail_out:
+    if (read)
+	read_deallocate(read);
+
+    return NULLRead;
+}
+
+/*
+ * Read the ALF format sequence with name `fn' into a Read structure.
+ * All printing characters (as defined by ANSII C `isprint')
+ * are accepted, but `N's are translated to `-'s. In this respect we
+ * are adhering (more or less) to the CSET_DEFAULT uncertainty code set.
+ * 
+ * Returns:
+ *   Read *	- Success, the Read structure read.
+ *   NULLRead	- Failure.
+ */
+Read *read_alf(char *fn) {
+    FILE *fp;
+    Read *read;
+
+    /* Open file */
+    if ((fp = fopen(fn, "rb")) == NULL)
+	return NULLRead;
+
+    read = fread_alf(fp);
+    fclose(fp);
+
+    if (read && (read->trace_name = (char *)xmalloc(strlen(fn)+1)))
+	strcpy(read->trace_name, fn);
+
+    return read;
+}
+
+/*
+ * Write to an ALF file - unsupported.
+ */
+/* ARGSUSED */
+int write_alf(char *fn, Read *read) {
+    fprintf(stderr, "ALF write support is unavailable\n");
+    return -1;
+}
+
+/*
+ * Write to an ALF file - unsupported.
+ */
+/* ARGSUSED */
+int fwrite_alf(FILE *fp, Read *read) {
+    fprintf(stderr, "ALF write support is unavailable\n");
+    return -1;
+}