clustalomega: clustalomega/clustal-omega-1.0.2/src/squid/sqio.c comparison

comparison clustalomega/clustal-omega-1.0.2/src/squid/sqio.c @ 1:bc707542e5de

Uploaded

author	clustalomega
date	Thu, 21 Jul 2011 13:35:08 -0400
parents
children

comparison

equal deleted inserted replaced

-:ff1768533a07
+:bc707542e5de
+/*****************************************************************
+* SQUID - a library of functions for biological sequence analysis
+* Copyright (C) 1992-2002 Washington University School of Medicine
+*
+*     This source code is freely distributed under the terms of the
+*     GNU General Public License. See the files COPYRIGHT and LICENSE
+*     for details.
+*****************************************************************/
+/* File: sqio.c
+* From: ureadseq.c in Don Gilbert's sequence i/o package
+*
+* Reads and writes nucleic/protein sequence in various
+* formats. Data files may have multiple sequences.
+*
+* Heavily modified from READSEQ package
+* Copyright (C) 1990 by D.G. Gilbert
+* Biology Dept., Indiana University, Bloomington, IN 47405
+* email: gilbertd@bio.indiana.edu
+* Thanks Don!
+*
+* SRE: Modifications as noted. Fri Jul  3 09:44:54 1992
+*      Packaged for squid, Thu Oct  1 10:07:11 1992
+*      ANSI conversion in full swing, Mon Jul 12 12:22:21 1993
+*
+* CVS $Id: sqio.c,v 1.29 2002/08/26 23:10:52 eddy Exp)
+*
+*****************************************************************
+* Basic API for single sequence reading:
+*
+* SQFILE *sqfp;
+* char   *seqfile;
+* int     format;        - see squid.h for formats; example: SQFILE_FASTA
+* char   *seq;
+* SQINFO  sqinfo;
+*
+* if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
+*   Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
+* while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) {
+*   do_stuff;
+*   FreeSequence(seq, &sqinfo);
+* }
+* SeqfileClose(sqfp);
+*
+*****************************************************************
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#ifndef SEEK_SET
+#include <unistd.h>
+#endif
+#include "squid.h"
+#include "msa.h"
+#include "ssi.h"
+static void SeqfileGetLine(SQFILE *V);
+#define kStartLength  500
+static char *aminos      = "ABCDEFGHIKLMNPQRSTVWXYZ*";
+static char *primenuc    = "ACGTUN";
+static char *protonly    = "EFIPQZ";
+static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode);
+/* Function: SeqfileOpen()
+*
+* Purpose : Open a sequence database file and prepare for reading
+*           sequentially.
+*
+* Args:     filename - name of file to open
+*           format   - format of file
+*           env      - environment variable for path (e.g. BLASTDB)
+*           ssimode  - -1, SSI_OFFSET_I32, or SSI_OFFSET_I64
+*
+*           Returns opened SQFILE ptr, or NULL on failure.
+*/
+SQFILE *
+SeqfileOpen(char *filename, int format, char *env)
+{
+return seqfile_open(filename, format, env, -1);
+}
+SQFILE *
+SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode)
+{
+return seqfile_open(filename, format, env, ssimode);
+}
+static SQFILE *
+seqfile_open(char *filename, int format, char *env, int ssimode)
+{
+SQFILE *dbfp;
+dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
+dbfp->ssimode  = ssimode;
+dbfp->rpl      = -1;		/* flag meaning "unset" */
+dbfp->lastrpl  = 0;
+dbfp->maxrpl   = 0;
+dbfp->bpl      = -1;		/* flag meaning "unset" */
+dbfp->lastbpl  = 0;
+dbfp->maxbpl   = 0;
+/* Open our file handle.
+* Three possibilities:
+*    1. normal file open
+*    2. filename = "-";    read from stdin
+*    3. filename = "*.gz"; read thru pipe from gzip
+* If we're reading from stdin or a pipe, we can't reliably
+* back up, so we can't do two-pass parsers like the interleaved alignment
+* formats.
+*/
+if (strcmp(filename, "-") == 0)
+{
+dbfp->f         = stdin;
+dbfp->do_stdin  = TRUE;
+dbfp->do_gzip   = FALSE;
+dbfp->fname     = sre_strdup("[STDIN]", -1);
+}
+#ifndef SRE_STRICT_ANSI
+/* popen(), pclose() aren't portable to non-POSIX systems; disable */
+else if (Strparse("^.*\\.gz$", filename, 0))
+{
+char cmd[256];
+/* Note that popen() will return "successfully"
+* if file doesn't exist, because gzip works fine
+* and prints an error! So we have to check for
+* existence of file ourself.
+*/
+if (! FileExists(filename))
+	Die("%s: file does not exist", filename);
+if (strlen(filename) + strlen("gzip -dc ") >= 256)
+	Die("filename > 255 char in SeqfileOpen()");
+sprintf(cmd, "gzip -dc %s", filename);
+if ((dbfp->f = popen(cmd, "r")) == NULL)
+	return NULL;
+dbfp->do_stdin = FALSE;
+dbfp->do_gzip  = TRUE;
+dbfp->fname    = sre_strdup(filename, -1);
+}
+#endif /*SRE_STRICT_ANSI*/
+else
+{
+if ((dbfp->f = fopen(filename, "r")) == NULL &&
+	  (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL)
+	return NULL;
+dbfp->do_stdin = FALSE;
+dbfp->do_gzip  = FALSE;
+dbfp->fname    = sre_strdup(filename, -1);
+}
+/* Invoke autodetection if we haven't already been told what
+* to expect.
+*/
+if (format == SQFILE_UNKNOWN)
+{
+if (dbfp->do_stdin == TRUE || dbfp->do_gzip)
+	Die("Can't autodetect sequence file format from a stdin or gzip pipe");
+format = SeqfileFormat(dbfp->f);
+if (format == SQFILE_UNKNOWN)
+	Die("Can't determine format of sequence file %s", dbfp->fname);
+}
+/* The hack for sequential access of an interleaved alignment file:
+* read the alignment in, we'll copy sequences out one at a time.
+*/
+dbfp->msa        = NULL;
+dbfp->afp        = NULL;
+dbfp->format     = format;
+dbfp->linenumber = 0;
+dbfp->buf        = NULL;
+dbfp->buflen     = 0;
+if (IsAlignmentFormat(format))
+{
+/* We'll be reading from the MSA interface. Copy our data
+* to the MSA afp's structure.
+*/
+dbfp->afp           = MallocOrDie(sizeof(MSAFILE));
+dbfp->afp->f        = dbfp->f;            /* just a ptr, don't close */
+dbfp->afp->do_stdin = dbfp->do_stdin;
+dbfp->afp->do_gzip  = dbfp->do_gzip;
+dbfp->afp->fname    = dbfp->fname;        /* just a ptr, don't free */
+dbfp->afp->format   = dbfp->format;       /* e.g. format */
+dbfp->afp->linenumber = dbfp->linenumber;	/* e.g. 0 */
+dbfp->afp->buf      = NULL;
+dbfp->afp->buflen   = 0;
+if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL)
+	Die("Failed to read any alignment data from file %s", dbfp->fname);
+				/* hack: overload/reuse msa->lastidx; indicates
+				   next seq to return upon a ReadSeq() call */
+dbfp->msa->lastidx = 0;
+return dbfp;
+}
+/* Load the first line.
+*/
+SeqfileGetLine(dbfp);
+return dbfp;
+}
+/* Function: SeqfilePosition()
+*
+* Purpose:  Move to a particular offset in a seqfile.
+*           Will not work on alignment files.
+*/
+void
+SeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset)
+{
+if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format))
+Die("SeqfilePosition() failed: in a nonrewindable data file or stream");
+if (SSISetFilePosition(sqfp->f, offset) != 0)
+Die("SSISetFilePosition failed, but that shouldn't happen.");
+SeqfileGetLine(sqfp);
+}
+/* Function: SeqfileRewind()
+*
+* Purpose:  Set a sequence file back to the first sequence.
+*
+*           Won't work on alignment files. Although it would
+*           seem that it could (just set msa->lastidx back to 0),
+*           that'll fail on "multiple multiple" alignment file formats
+*           (e.g. Stockholm).
+*/
+void
+SeqfileRewind(SQFILE *sqfp)
+{
+if (sqfp->do_stdin || sqfp->do_gzip)
+Die("SeqfileRewind() failed: in a nonrewindable data file or stream");
+rewind(sqfp->f);
+SeqfileGetLine(sqfp);
+}
+/* Function: SeqfileLineParameters()
+* Date:     SRE, Thu Feb 15 17:00:41 2001 [St. Louis]
+*
+* Purpose:  After all the sequences have been read from the file,
+*           but before closing it, retrieve overall bytes-per-line and
+*           residues-per-line info. If non-zero, these mean that
+*           the file contains homogeneous sequence line lengths (except
+*           the last line in each record).
+*
+*           If either of bpl or rpl is determined to be inhomogeneous,
+*           both are returned as 0.
+*
+* Args:     *sqfp   - an open but fully read sequence file
+*           ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous
+*           ret_rpl - RETURN: residues per line, or 0 if inhomogenous.
+*
+* Returns:  void
+*/
+void
+SeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl)
+{
+if (V->rpl > 0 && V->maxrpl == V->rpl &&
+V->bpl > 0 && V->maxbpl == V->bpl) {
+*ret_bpl = V->bpl;
+*ret_rpl = V->rpl;
+} else {
+*ret_bpl = 0;
+*ret_rpl = 0;
+}
+}
+void
+SeqfileClose(SQFILE *sqfp)
+{
+/* note: don't test for sqfp->msa being NULL. Now that
+* we're holding afp open and allowing access to multi-MSA
+* databases (e.g. Stockholm format, Pfam), msa ends
+* up being NULL when we run out of alignments.
+*/
+if (sqfp->afp != NULL) {
+if (sqfp->msa      != NULL) MSAFree(sqfp->msa);
+if (sqfp->afp->buf != NULL) free(sqfp->afp->buf);
+free(sqfp->afp);
+}
+#ifndef SRE_STRICT_ANSI	/* gunzip functionality only on POSIX systems */
+if (sqfp->do_gzip)         pclose(sqfp->f);
+#endif
+else if (! sqfp->do_stdin) fclose(sqfp->f);
+if (sqfp->buf   != NULL) free(sqfp->buf);
+if (sqfp->fname != NULL) free(sqfp->fname);
+free(sqfp);
+}
+/* Function: SeqfileGetLine()
+* Date:     SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre]
+*
+* Purpose:  read a line from a sequence file into V->buf
+*           If the fgets() is NULL, sets V->buf[0] to '\0'.
+*
+* Args:     V
+*
+* Returns:  void
+*/
+static void
+SeqfileGetLine(SQFILE *V)
+{
+if (V->ssimode >= 0)
+if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset)))
+Die("SSIGetFilePosition() failed");
+if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL)
+*(V->buf) = '\0';
+V->linenumber++;
+}
+void
+FreeSequence(char *seq, SQINFO *sqinfo)
+{
+if (seq != NULL) free(seq); /* FS, r244, here is potential problem in profile/profile */
+if (sqinfo->flags & SQINFO_SS){
+if (NULL != sqinfo->ss){ /* FS, r244 -> r245 */
+free(sqinfo->ss);
+}
+}
+if (sqinfo->flags & SQINFO_SA){
+if (NULL != sqinfo->sa){ /* FS, r244 -> r245 */
+free(sqinfo->sa);
+}
+}
+}
+int
+SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag)
+{
+int len;
+int pos;
+				/* silently ignore NULL. */
+if (sptr == NULL) return 1;
+while (*sptr == ' ') sptr++; /* ignore leading whitespace */
+for (pos = strlen(sptr)-1; pos >= 0; pos--)
+if (! isspace((int) sptr[pos])) break;
+sptr[pos+1] = '\0';	       /* ignore trailing whitespace */
+switch (flag) {
+case SQINFO_NAME:
+if (*sptr != '-')
+{
+	strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1);
+	sqinfo->name[SQINFO_NAMELEN-1] = '\0';
+	sqinfo->flags   |= SQINFO_NAME;
+}
+break;
+case SQINFO_ID:
+if (*sptr != '-')
+{
+	strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1);
+	sqinfo->id[SQINFO_NAMELEN-1] = '\0';
+	sqinfo->flags |= SQINFO_ID;
+}
+break;
+case SQINFO_ACC:
+if (*sptr != '-')
+{
+	strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1);
+	sqinfo->acc[SQINFO_NAMELEN-1] = '\0';
+	sqinfo->flags   |= SQINFO_ACC;
+}
+break;
+case SQINFO_DESC:
+if (*sptr != '-')
+{
+	if (sqinfo->flags & SQINFO_DESC) /* append? */
+	  {
+	    len = strlen(sqinfo->desc);
+	    if (len < SQINFO_DESCLEN-2)	/* is there room? */
+	      {
+		strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++;
+		strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len);
+	      }
+	  }
+	else			/* else copy */
+	  strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1);
+	sqinfo->desc[SQINFO_DESCLEN-1] = '\0';
+	sqinfo->flags   |= SQINFO_DESC;
+}
+break;
+case SQINFO_START:
+if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
+sqinfo->start = atoi(sptr);
+if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START;
+break;
+case SQINFO_STOP:
+if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
+sqinfo->stop = atoi(sptr);
+if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP;
+break;
+case SQINFO_OLEN:
+if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
+sqinfo->olen = atoi(sptr);
+if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN;
+break;
+default:
+Die("Invalid flag %d to SetSeqinfoString()", flag);
+}
+return 1;
+}
+void
+SeqinfoCopy(SQINFO *sq1, SQINFO *sq2)
+{
+sq1->flags = sq2->flags;
+if (sq2->flags & SQINFO_NAME)  strcpy(sq1->name, sq2->name);
+if (sq2->flags & SQINFO_ID)    strcpy(sq1->id,   sq2->id);
+if (sq2->flags & SQINFO_ACC)   strcpy(sq1->acc,  sq2->acc);
+if (sq2->flags & SQINFO_DESC)  strcpy(sq1->desc, sq2->desc);
+if (sq2->flags & SQINFO_LEN)   sq1->len    = sq2->len;
+if (sq2->flags & SQINFO_START) sq1->start  = sq2->start;
+if (sq2->flags & SQINFO_STOP)  sq1->stop   = sq2->stop;
+if (sq2->flags & SQINFO_OLEN)  sq1->olen   = sq2->olen;
+if (sq2->flags & SQINFO_TYPE)  sq1->type   = sq2->type;
+if (sq2->flags & SQINFO_SS)    sq1->ss     = Strdup(sq2->ss);
+if (sq2->flags & SQINFO_SA)    sq1->sa     = Strdup(sq2->sa);
+}
+/* Function: ToDNA()
+*
+* Purpose:  Convert a sequence to DNA.
+*           U --> T
+*/
+void
+ToDNA(char *seq)
+{
+for (; *seq != '\0'; seq++)
+{
+if      (*seq == 'U') *seq = 'T';
+else if (*seq == 'u') *seq = 't';
+}
+}
+/* Function: ToRNA()
+*
+* Purpose:  Convert a sequence to RNA.
+*           T --> U
+*/
+void
+ToRNA(char *seq)
+{
+for (; *seq != '\0'; seq++)
+{
+if      (*seq == 'T') *seq = 'U';
+else if (*seq == 't') *seq = 'u';
+}
+}
+/* Function: ToIUPAC()
+*
+* Purpose:  Convert X's, o's, other junk in a nucleic acid sequence to N's,
+*           to comply with IUPAC code. If is_aseq is TRUE, will allow gap
+*           characters though, so we can call ToIUPAC() on aligned seqs.
+*
+*           NUCLEOTIDES is defined in squid.h as:
+*               "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
+*           gap chars allowed by isgap() are defined in squid.h as:
+*               " ._-~"
+*
+*           WU-BLAST's pressdb will
+*           choke on X's, for instance, necessitating conversion
+*           of certain genome centers' data.
+*/
+void
+ToIUPAC(char *seq, int is_aseq)
+{
+if (is_aseq) {
+for (; *seq != '\0'; seq++)
+if (strchr(NUCLEOTIDES, *seq) == NULL && ! isgap(*seq)) *seq = 'N';
+} else {
+for (; *seq != '\0'; seq++)
+if (strchr(NUCLEOTIDES, *seq) == NULL) *seq = 'N';
+}
+}
+/* Function: addseq()
+*
+* Purpose:  Add a line of sequence to the growing string in V.
+*
+*           In the seven supported unaligned formats, all sequence
+*           lines may contain whitespace that must be filtered out;
+*           four formats (PIR, EMBL, Genbank, GCG) include coordinates
+*           that must be filtered out. Thus an (!isdigit && !isspace)
+*           test on each character before we accept it.
+*/
+static void
+addseq(char *s, struct ReadSeqVars *V)
+{
+char *s0;
+char *sq;
+int   rpl;			/* valid residues per line */
+int   bpl;			/* characters per line     */
+if (V->ssimode == -1)
+{				/* Normal mode: keeping the seq */
+/* Make sure we have enough room. We know that s is <= buflen,
+* so just make sure we've got room for a whole new buflen worth
+* of sequence.
+*/
+if (V->seqlen + V->buflen > V->maxseq) {
+	V->maxseq += MAX(V->buflen, kStartLength);
+	V->seq = ReallocOrDie (V->seq, V->maxseq+1);
+}
+sq = V->seq + V->seqlen;
+while (*s != 0) {
+#ifdef CLUSTALO
+if (! isdigit((int) *s) && ! isspace((int) *s) && isprint((int) *s)) {
+#else
+	if (! isdigit((int) *s) && ! isspace((int) *s)) {
+#endif
+	  *sq = *s;
+	  sq++;
+	}
+	s++;
+}
+V->seqlen = sq - V->seq;
+}
+else				/* else: indexing mode, discard the seq */
+{
+s0 = s;
+rpl = 0;
+while (*s != 0) {
+	if (! isdigit((int) *s) && ! isspace((int) *s)) {
+	  rpl++;
+	}
+	s++;
+}
+V->seqlen += rpl;
+bpl = s - s0;
+/* Keep track of the global rpl, bpl for the file.
+* This is overly complicated because we have to
+* allow the last line of each record (e.g. the last addseq() call
+* on each sequence) to have a different length - and sometimes
+* we'll have one-line sequence records, too.  Thus we only
+* do something with the global V->rpl when we have *passed over*
+* a line - we keep the last line's rpl in last_rpl. And because
+* a file might consist entirely of single-line records, we keep
+* a third guy, maxrpl, that tells us the maximum rpl of any line
+* in the file. If we reach the end of file and rpl is still unset,
+* we'll set it to maxrpl. If we reach eof and rpl is set, but is
+* less than maxrpl, that's a weird case where a last line in some
+* record is longer than every other line.
+*/
+if (V->rpl != 0) {		/* 0 means we already know rpl is invalid       */
+	if (V->lastrpl > 0) {	/* we're on something that's not the first line */
+	  if (V->rpl > 0 && V->lastrpl != V->rpl)  V->rpl = 0;
+	  else if (V->rpl == -1)                   V->rpl = V->lastrpl;
+	}
+	V->lastrpl = rpl;
+	if (rpl > V->maxrpl) V->maxrpl = rpl; /* make sure we check max length of final lines */
+}
+if (V->bpl != 0) {		/* 0 means we already know bpl is invalid       */
+	if (V->lastbpl > 0) {	/* we're on something that's not the first line */
+	  if (V->bpl > 0 && V->lastbpl != V->bpl)  V->bpl = 0;
+	  else if (V->bpl == -1)                   V->bpl = V->lastbpl;
+	}
+	V->lastbpl = bpl;
+	if (bpl > V->maxbpl) V->maxbpl = bpl; /* make sure we check max length of final lines */
+}
+} /* end of indexing mode of addseq(). */
+}
+static void
+readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V)
+{
+int addend = 0;
+int done   = 0;
+V->seqlen = 0;
+V->lastrpl = V->lastbpl = 0;
+if (addfirst) {
+if (V->ssimode >= 0) V->d_off = V->ssioffset;
+addseq(V->buf, V);
+} else if (V->ssimode >= 0)
+if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->d_off)))
+Die("SSIGetFilePosition() failed");
+do {
+SeqfileGetLine(V);
+	/* feof() alone is a bug; files not necessarily \n terminated */
+if (*(V->buf) == '\0' && feof(V->f))
+done = TRUE;
+done |= (*endTest)(V->buf, &addend);
+if (addend || !done)
+addseq(V->buf, V);
+} while (!done);
+}
+static int
+endPIR(char *s, int  *addend)
+{
+*addend = 0;
+if ((strncmp(s, "///", 3) == 0) ||
+(strncmp(s, "ENTRY", 5) == 0))
+return 1;
+else
+return 0;
+}
+static void
+readPIR(struct ReadSeqVars *V)
+{
+char *sptr;
+				/* load first line of entry  */
+while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
+SeqfileGetLine(V);
+}
+if (feof(V->f)) return;
+if (V->ssimode >= 0) V->r_off = V->ssioffset;
+if ((sptr = strtok(V->buf + 15, "\n\t ")) != NULL)
+{
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
+}
+do {
+SeqfileGetLine(V);
+if (!feof(V->f) && strncmp(V->buf, "TITLE", 5) == 0)
+SetSeqinfoString(V->sqinfo, V->buf+15, SQINFO_DESC);
+else if (!feof(V->f) && strncmp(V->buf, "ACCESSION", 9) == 0)
+{
+	if ((sptr = strtok(V->buf+15, " \t\n")) != NULL)
+	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
+}
+} while (! feof(V->f) && (strncmp(V->buf,"SEQUENCE", 8) != 0));
+SeqfileGetLine(V);			/* skip next line, coords */
+readLoop(0, endPIR, V);
+/* reading a real PIR-CODATA database file, we keep the source coords
+*/
+V->sqinfo->start = 1;
+V->sqinfo->stop  = V->seqlen;
+V->sqinfo->olen  = V->seqlen;
+V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
+/* get next line
+*/
+while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
+SeqfileGetLine(V);
+}
+}
+static int
+endIG(char *s, int  *addend)
+{
+*addend = 1; /* 1 or 2 occur in line w/ bases */
+return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL));
+}
+static void
+readIG(struct ReadSeqVars *V)
+{
+char *nm;
+				/* position past ';' comments */
+do {
+SeqfileGetLine(V);
+} while (! (feof(V->f) || ((*V->buf != 0) && (*V->buf != ';')) ));
+if (!feof(V->f))
+{
+if ((nm = strtok(V->buf, "\n\t ")) != NULL)
+	SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
+readLoop(0, endIG, V);
+}
+while (!(feof(V->f) || ((*V->buf != '\0') && (*V->buf == ';'))))
+SeqfileGetLine(V);
+}
+static int
+endStrider(char *s, int *addend)
+{
+*addend = 0;
+return (strstr( s, "//") != NULL);
+}
+static void
+readStrider(struct ReadSeqVars *V)
+{
+char *nm;
+while ((!feof(V->f)) && (*V->buf == ';'))
+{
+if (strncmp(V->buf,"; DNA sequence", 14) == 0)
+	{
+	  if ((nm = strtok(V->buf+16, ",\n\t ")) != NULL)
+	    SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
+	}
+SeqfileGetLine(V);
+}
+if (! feof(V->f))
+readLoop(1, endStrider, V);
+/* load next line
+*/
+while ((!feof(V->f)) && (*V->buf != ';'))
+SeqfileGetLine(V);
+}
+static int
+endGB(char *s, int *addend)
+{
+*addend = 0;
+return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s));
+}
+static void
+readGenBank(struct ReadSeqVars *V)
+{
+char *sptr;
+int   in_definition;
+/* We'll map three genbank identifiers onto names:
+*     LOCUS     -> sqinfo.name
+*     ACCESSION -> sqinfo.acc   [primary accession only]
+*     VERSION   -> sqinfo.id
+* We don't currently store the GI number, or secondary accessions.
+*/
+while (strncmp(V->buf, "LOCUS", 5) != 0) {
+SeqfileGetLine(V);
+}
+if (V->ssimode >= 0) V->r_off = V->ssioffset;
+if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
+in_definition = FALSE;
+while (! feof(V->f))
+{
+SeqfileGetLine(V);
+if (! feof(V->f) && strstr(V->buf, "DEFINITION") == V->buf)
+	{
+	  if ((sptr = strtok(V->buf+12, "\n")) != NULL)
+	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
+	  in_definition = TRUE;
+	}
+else if (! feof(V->f) && strstr(V->buf, "ACCESSION") == V->buf)
+	{
+	  if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
+	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
+	  in_definition = FALSE;
+	}
+else if (! feof(V->f) && strstr(V->buf, "VERSION") == V->buf)
+	{
+	  if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
+	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
+	  in_definition = FALSE;
+	}
+else if (strncmp(V->buf,"ORIGIN", 6) != 0)
+	{
+	  if (in_definition)
+	    SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
+	}
+else
+	break;
+}
+readLoop(0, endGB, V);
+/* reading a real GenBank database file, we keep the source coords
+*/
+V->sqinfo->start = 1;
+V->sqinfo->stop  = V->seqlen;
+V->sqinfo->olen  = V->seqlen;
+V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
+while (!(feof(V->f) || ((*V->buf!=0) && (strstr(V->buf,"LOCUS") == V->buf))))
+SeqfileGetLine(V);
+				/* SRE: V->s now holds "//", so sequential
+				   reads are wedged: fixed Tue Jul 13 1993 */
+while (!feof(V->f) && strstr(V->buf, "LOCUS  ") != V->buf)
+SeqfileGetLine(V);
+}
+static int
+endGCGdata(char *s, int *addend)
+{
+*addend = 0;
+return (*s == '>');
+}
+static void
+readGCGdata(struct ReadSeqVars *V)
+{
+int   binary = FALSE;		/* whether data are binary or not */
+int   blen = 0;		/* length of binary sequence */
+				/* first line contains ">>>>" followed by name */
+if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->buf, 2))
+{
+binary = TRUE;
+SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
+blen = atoi(sqd_parse[2]);
+}
+else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->buf, 1))
+SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
+else
+Die("bogus GCGdata format? %s", V->buf);
+				/* second line contains free text description */
+SeqfileGetLine(V);
+SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
+if (binary) {
+/* allocate for blen characters +3... (allow for 3 bytes of slop) */
+if (blen >= V->maxseq) {
+V->maxseq = blen;
+if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL)
+	Die("malloc failed");
+}
+				/* read (blen+3)/4 bytes from file */
+if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4))
+Die("fread failed");
+V->seqlen = blen;
+				/* convert binary code to seq */
+GCGBinaryToSequence(V->seq, blen);
+}
+else readLoop(0, endGCGdata, V);
+while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>'))))
+SeqfileGetLine(V);
+}
+static int
+endPearson(char *s, int *addend)
+{
+*addend = 0;
+return(*s == '>');
+}
+static void
+readPearson(struct ReadSeqVars *V)
+{
+char *sptr;
+if (V->ssimode >= 0) V->r_off = V->ssioffset;
+if (*V->buf != '>')
+Die("\
+File %s does not appear to be in FASTA format at line %d.\n\
+You may want to specify the file format on the command line.\n\
+Usually this is done with an option --informat <fmt>.\n",
+	V->fname, V->linenumber);
+if ((sptr = strtok(V->buf+1, "\n\t ")) != NULL)
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
+if ((sptr = strtok(NULL, "\n")) != NULL)
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
+readLoop(0, endPearson, V);
+while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) {
+SeqfileGetLine(V);
+}
+}
+static int
+endEMBL(char *s, int *addend)
+{
+*addend = 0;
+/* Some people (Berlin 5S rRNA database, f'r instance) use
+* an extended EMBL format that attaches extra data after
+* the sequence -- watch out for that. We use the fact that
+* real EMBL sequence lines begin with five spaces.
+*
+* We can use this as the sole end test because readEMBL() will
+* advance to the next ID line before starting to read again.
+*/
+return (strncmp(s,"     ",5) != 0);
+/*  return ((strstr(s,"//") != NULL) || (strstr(s,"ID   ") == s)); */
+}
+static void
+readEMBL(struct ReadSeqVars *V)
+{
+char *sptr;
+				/* make sure we have first line */
+while (!feof(V->f) && strncmp(V->buf, "ID  ", 4) != 0) {
+SeqfileGetLine(V);
+}
+if (V->ssimode >= 0) V->r_off = V->ssioffset;
+if ((sptr = strtok(V->buf+5, "\n\t ")) != NULL)
+{
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
+}
+do {
+SeqfileGetLine(V);
+if (!feof(V->f) && strstr(V->buf, "AC  ") == V->buf)
+{
+	if ((sptr = strtok(V->buf+5, ";  \t\n")) != NULL)
+	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
+}
+else if (!feof(V->f) && strstr(V->buf, "DE  ") == V->buf)
+{
+	if ((sptr = strtok(V->buf+5, "\n")) != NULL)
+	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
+}
+} while (! feof(V->f) && strncmp(V->buf,"SQ",2) != 0);
+readLoop(0, endEMBL, V);
+/* Hack for Staden experiment files: convert - to N
+*/
+if (V->ssimode == -1)		/* if we're in ssi mode, we're not keeping the seq */
+for (sptr = V->seq; *sptr != '\0'; sptr++)
+if (*sptr == '-') *sptr = 'N';
+/* reading a real EMBL database file, we keep the source coords
+*/
+V->sqinfo->start = 1;
+V->sqinfo->stop  = V->seqlen;
+V->sqinfo->olen  = V->seqlen;
+V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
+				/* load next record's ID line */
+while (!feof(V->f) && strncmp(V->buf, "ID  ", 4) != 0) {
+SeqfileGetLine(V);
+}
+}
+static int
+endZuker(char *s, int *addend)
+{
+*addend = 0;
+return( *s == '(' );
+}
+static void
+readZuker(struct ReadSeqVars *V)
+{
+char *sptr;
+SeqfileGetLine(V);  /*s == "seqLen seqid string..."*/
+if ((sptr = strtok(V->buf+6, " \t\n")) != NULL)
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
+if ((sptr = strtok(NULL, "\n")) != NULL)
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
+readLoop(0, endZuker, V);
+while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '('))))
+SeqfileGetLine(V);
+}
+static void
+readUWGCG(struct ReadSeqVars *V)
+{
+char  *si;
+char  *sptr;
+int    done;
+V->seqlen = 0;
+/*writeseq: "    %s  Length: %d  (today)  Check: %d  ..\n" */
+/*drop above or ".." from id*/
+if ((si = strstr(V->buf,"  Length: ")) != NULL) *si = 0;
+else if ((si = strstr(V->buf,"..")) != NULL)    *si = 0;
+if ((sptr = strtok(V->buf, "\n\t ")) != NULL)
+SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
+do {
+done = feof(V->f);
+SeqfileGetLine(V);
+if (! done) addseq(V->buf, V);
+} while (!done);
+}
+/* Function: ReadSeq()
+*
+* Purpose:  Read next sequence from an open database file.
+*           Return the sequence and associated info.
+*
+* Args:     fp      - open sequence database file pointer
+*           format  - format of the file (previously determined
+*                      by call to SeqfileFormat()).
+*                      Currently unused, since we carry it in V.
+*           ret_seq - RETURN: sequence
+*           sqinfo  - RETURN: filled in w/ other information
+*
+* Limitations: uses squid_errno, so it's not threadsafe.
+*
+* Return:   1 on success, 0 on failure.
+*           ret_seq and some field of sqinfo are allocated here,
+*           The preferred call mechanism to properly free the memory is:
+*
+*           SQINFO sqinfo;
+*           char  *seq;
+*
+*           ReadSeq(fp, format, &seq, &sqinfo);
+*           ... do something...
+*           FreeSequence(seq, &sqinfo);
+*/
+int
+ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
+{
+int    gotuw;
+squid_errno = SQERR_OK;
+/* Here's the hack for sequential access of sequences from
+* the multiple sequence alignment formats
+*/
+if (IsAlignmentFormat(V->format))
+{
+if (V->msa->lastidx >= V->msa->nseq)
+	{ /* out of data. try to read another alignment */
+	  MSAFree(V->msa);
+	  if ((V->msa = MSAFileRead(V->afp)) == NULL)
+	    return 0;
+	  V->msa->lastidx = 0;
+	}
+				/* copy and dealign the appropriate aligned seq */
+/* AW: stopping squid from dealigning sequences and corresponding info */
+#ifdef CLUSTALO
+V->seq = sre_strdup(V->msa->aseq[V->msa->lastidx], V->msa->alen);
+#else
+MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
+V->msa->aseq[V->msa->lastidx], &(V->seq));
+#endif
+V->seqlen = strlen(V->seq);
+/* Extract sqinfo stuff for this sequence from the msa.
+* Tedious; code that should be cleaned.
+*/
+sqinfo->flags = 0;
+if (V->msa->sqname[V->msa->lastidx] != NULL)
+	SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME);
+if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL)
+	SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC);
+if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL)
+	SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC);
+if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) {
+/* AW: stopping squid from dealigning sequences and corresponding info */
+#ifdef CLUSTALO
+sqinfo->ss = sre_strdup(V->msa->ss[V->msa->lastidx], V->msa->alen);
+#else
+MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
+V->msa->ss[V->msa->lastidx], &(sqinfo->ss));
+#endif
+sqinfo->flags |= SQINFO_SS;
+}
+if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) {
+/* AW: stopping squid from dealigning sequences and corresponding info */
+#ifdef CLUSTALO
+sqinfo->sa = sre_strdup(V->msa->sa[V->msa->lastidx], V->msa->alen);
+#else
+MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
+V->msa->sa[V->msa->lastidx], &(sqinfo->sa));
+#endif
+	sqinfo->flags |= SQINFO_SA;
+}
+V->msa->lastidx++;
+}
+else {
+if (feof(V->f)) return 0;
+if (V->ssimode == -1) {	/* normal mode */
+V->seq           = (char*) calloc (kStartLength+1, sizeof(char));
+V->maxseq        = kStartLength;
+} else {			/* index mode: discarding seq */
+V->seq           = NULL;
+V->maxseq        = 0;
+}
+V->seqlen        = 0;
+V->sqinfo        = sqinfo;
+V->sqinfo->flags = 0;
+switch (V->format) {
+case SQFILE_IG      : readIG(V);      break;
+case SQFILE_STRIDER : readStrider(V); break;
+case SQFILE_GENBANK : readGenBank(V); break;
+case SQFILE_FASTA   : readPearson(V); break;
+case SQFILE_EMBL    : readEMBL(V);    break;
+case SQFILE_ZUKER   : readZuker(V);   break;
+case SQFILE_PIR     : readPIR(V);     break;
+case SQFILE_GCGDATA : readGCGdata(V); break;
+case SQFILE_GCG :
+do {			/* skip leading comments on GCG file */
+	gotuw = (strstr(V->buf,"..") != NULL);
+	if (gotuw) readUWGCG(V);
+	SeqfileGetLine(V);
+} while (! feof(V->f));
+break;
+case SQFILE_IDRAW:   /* SRE: no attempt to read idraw postscript */
+default:
+squid_errno = SQERR_FORMAT;
+free(V->seq);
+return 0;
+}
+if (V->seq != NULL)		/* (it can be NULL in indexing mode) */
+V->seq[V->seqlen] = 0; /* stick a string terminator on it */
+}
+/* Cleanup
+*/
+sqinfo->len    = V->seqlen;
+sqinfo->flags |= SQINFO_LEN;
+*ret_seq = V->seq;
+if (squid_errno == SQERR_OK) return 1; else return 0;
+}
+/* Function: SeqfileFormat()
+* Date:     SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre]
+*
+* Purpose:  Determine format of an open file.
+*           Returns format code.
+*           Rewinds the file.
+*
+*           Autodetects the following unaligned formats:
+*              SQFILE_FASTA
+*              SQFILE_GENBANK
+*              SQFILE_EMBL
+*              SQFILE_GCG
+*              SQFILE_GCGDATA
+*              SQFILE_PIR
+*           Also autodetects the following alignment formats:
+*              MSAFILE_STOCKHOLM
+*              MSAFILE_MSF
+*              MSAFILE_CLUSTAL
+*              MSAFILE_SELEX
+*              MSAFILE_PHYLIP
+*
+*           Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA.
+*           MSAFileFormat() does the opposite.
+*
+* Args:     sfp -  open SQFILE
+*
+* Return:   format code, or SQFILE_UNKNOWN if unrecognized
+*/
+int
+SeqfileFormat(FILE *fp)
+{
+char *buf;
+int   len;
+int   fmt = SQFILE_UNKNOWN;
+int   ndataline;
+char *bufcpy, *s, *s1, *s2;
+int   has_junk;
+buf       = NULL;
+len       = 0;
+ndataline = 0;
+has_junk  = FALSE;
+while (sre_fgets(&buf, &len, fp) != NULL)
+{
+if (IsBlankline(buf)) continue;
+/* Well-behaved formats identify themselves in first nonblank line.
+*/
+if (ndataline == 0)
+	{
+	  if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: "))
+	    { fmt = SQFILE_GCGDATA; goto DONE; }
+	  if (buf[0] == '>')
+	    { fmt = SQFILE_FASTA; goto DONE; }
+	  if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 ||
+	      strncmp(buf, "!!NA_SEQUENCE", 13) == 0)
+	    { fmt = SQFILE_GCG; goto DONE; }
+	  if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0)
+	    { fmt = MSAFILE_STOCKHOLM; goto DONE; }
+	  if (strncmp(buf, "CLUSTAL", 7) == 0 &&
+	      strstr(buf, "multiple sequence alignment") != NULL)
+	    { fmt = MSAFILE_CLUSTAL; goto DONE; }
+	  if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 ||
+	      strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0)
+	    { fmt = MSAFILE_MSF; goto DONE; }
+	  			/* PHYLIP id: also just a good bet */
+	  bufcpy = sre_strdup(buf, -1);
+	  s = bufcpy;
+	  if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
+	      (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
+	      IsInt(s1) &&
+	      IsInt(s2))
+	    { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; }
+	  free(bufcpy);
+	}
+/* We trust that other formats identify themselves soon.
+*/
+				/* dead giveaways for extended SELEX */
+if (strncmp(buf, "#=AU", 4) == 0 ||
+strncmp(buf, "#=ID", 4) == 0 ||
+	  strncmp(buf, "#=AC", 4) == 0 ||
+	  strncmp(buf, "#=DE", 4) == 0 ||
+	  strncmp(buf, "#=GA", 4) == 0 ||
+	  strncmp(buf, "#=TC", 4) == 0 ||
+	  strncmp(buf, "#=NC", 4) == 0 ||
+	  strncmp(buf, "#=SQ", 4) == 0 ||
+	  strncmp(buf, "#=SS", 4) == 0 ||
+	  strncmp(buf, "#=CS", 4) == 0 ||
+	  strncmp(buf, "#=RF", 4) == 0)
+	{ fmt = MSAFILE_SELEX; goto DONE; }
+if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0)
+	{ fmt = SQFILE_PIR; goto DONE; }
+				/* a ha, diagnostic of an (old) MSF file */
+if ((strstr(buf, "..")    != NULL) &&
+	  (strstr(buf, "MSF:")  != NULL) &&
+	  (strstr(buf, "Check:")!= NULL))
+	{ fmt = MSAFILE_MSF; goto DONE; }
+				/* unaligned GCG (must follow MSF test!) */
+if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL)
+	{ fmt = SQFILE_GCG; goto DONE; }
+if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0)
+	{ fmt = SQFILE_GENBANK; goto DONE; }
+if (strncmp(buf,"ID   ",5) == 0 || strncmp(buf,"SQ   ",5) == 0)
+	{ fmt = SQFILE_EMBL; goto DONE; }
+/* But past here, we're being desperate. A simple SELEX file is
+* very difficult to detect; we can only try to disprove it.
+*/
+s = buf;
+if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */
+if (strchr("#%", *s1) != NULL) continue;   /* skip comment lines */
+/* Disproof 1. Noncomment, nonblank lines in a SELEX file
+* must have at least two space-delimited fields (name/seq)
+*/
+if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
+	has_junk = TRUE;
+/* Disproof 2.
+* The sequence field should look like a sequence.
+*/
+if (s2 != NULL && Seqtype(s2) == kOtherSeq)
+	has_junk = TRUE;
+ndataline++;
+if (ndataline == 300) break; /* only look at first 300 lines */
+}
+if (ndataline == 0)
+Die("Sequence file contains no data");
+/* If we've made it this far, we've run out of data, but there
+* was at least one line of it; check if we've
+* disproven SELEX. If not, cross our fingers, pray, and guess SELEX.
+*/
+if (has_junk == TRUE) fmt = SQFILE_UNKNOWN;
+else                  fmt = MSAFILE_SELEX;
+DONE:
+if (buf != NULL) free(buf);
+rewind(fp);
+return fmt;
+}
+/* Function: GCGBinaryToSequence()
+*
+* Purpose:  Convert a GCG 2BIT binary string to DNA sequence.
+*           0 = C  1 = T  2 = A  3 = G
+*           4 nts/byte
+*
+* Args:     seq  - binary sequence. Converted in place to DNA.
+*           len  - length of DNA. binary is (len+3)/4 bytes
+*/
+int
+GCGBinaryToSequence(char *seq, int len)
+{
+int   bpos;			/* position in binary   */
+int   spos;			/* position in sequence */
+char  twobit;
+int   i;
+for (bpos = (len-1)/4; bpos >= 0; bpos--)
+{
+twobit = seq[bpos];
+spos   = bpos*4;
+for (i = 3; i >= 0; i--)
+	{
+	  switch (twobit & 0x3) {
+	  case 0: seq[spos+i] = 'C'; break;
+	  case 1: seq[spos+i] = 'T'; break;
+	  case 2: seq[spos+i] = 'A'; break;
+	  case 3: seq[spos+i] = 'G'; break;
+	  }
+	  twobit = twobit >> 2;
+	}
+}
+seq[len] = '\0';
+return 1;
+}
+/* Function: GCGchecksum()
+* Date:     SRE, Mon May 31 11:13:21 1999 [St. Louis]
+*
+* Purpose:  Calculate a GCG checksum for a sequence.
+*           Code provided by Steve Smith of Genetics
+*           Computer Group.
+*
+* Args:     seq -  sequence to calculate checksum for.
+*                  may contain gap symbols.
+*           len -  length of sequence (usually known,
+*                  so save a strlen() call)
+*
+* Returns:  GCG checksum.
+*/
+int
+GCGchecksum(char *seq, int len)
+{
+int i;			/* position in sequence */
+int chk = 0;			/* calculated checksum  */
+for (i = 0; i < len; i++)
+chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000;
+return chk;
+}
+/* Function: GCGMultchecksum()
+*
+* Purpose:  GCG checksum for a multiple alignment: sum of
+*           individual sequence checksums (including their
+*           gap characters) modulo 10000.
+*
+*           Implemented using spec provided by Steve Smith of
+*           Genetics Computer Group.
+*
+* Args:     seqs - sequences to be checksummed; aligned or not
+*           nseq - number of sequences
+*
+* Return:   the checksum, a number between 0 and 9999
+*/
+int
+GCGMultchecksum(char **seqs, int nseq)
+{
+int chk = 0;
+int idx;
+for (idx = 0; idx < nseq; idx++)
+chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000;
+return chk;
+}
+/* Function: Seqtype()
+*
+* Purpose:  Returns a (very good) guess about type of sequence:
+*           kDNA, kRNA, kAmino, or kOtherSeq.
+*
+*           Modified from, and replaces, Gilbert getseqtype().
+*/
+int
+Seqtype(char *seq)
+{
+int  saw;			/* how many non-gap characters I saw */
+char c;
+int  po = 0;			/* count of protein-only */
+int  nt = 0;			/* count of t's */
+int  nu = 0;			/* count of u's */
+int  na = 0;			/* count of nucleotides */
+int  aa = 0;			/* count of amino acids */
+int  no = 0;			/* count of others */
+/* Look at the first 300 non-gap characters
+*/
+#ifdef CLUSTALO
+/* VGGNGDDYLSGGTGNDTL is recognized as unknown using squid's default
+* approach.
+* We change it to the following:
+* 1. counting: ignore gaps and not alpha characters. if protein-only then
+* count as such (po). otherwise decide if amino-acid (aa) or nucleic-acid
+* (na) or unknown (no)
+*
+* 2. determine type: if we saw more unknown than aa or na, return unknown.
+* if encountered protein-only return protein-only. otherwise decide based
+* on majority. (if aa==na return na)
+*/
+for (saw = 0; *seq != '\0' && saw < 300; seq++) {
+		  c = sre_toupper((int) *seq);
+		  int unknown = 1;
+		  if (isgap(c) ||  ! isalpha((int) c))  {
+				  continue;
+		  }
+		  if (strchr(protonly, c)) {
+				  po++;
+				  unknown = 0;
+		  }
+		  if (strchr(aminos,c)) {
+				  aa++;
+				  unknown = 0;
+		  }
+		  if (strchr(primenuc,c)) {
+				  na++;
+				  unknown = 0;
+				  if (c == 'T')
+						  nt++;
+				  else if (c == 'U')
+						  nu++;
+		  }
+		  if (unknown) {
+				  no ++;
+		  }
+		  saw++;
+}
+if (no > aa && no > na)
+		  return kOtherSeq;
+if (po > 0 || aa>na)
+		  return kAmino;
+if (na >= aa) {
+		 if (nu > nt)
+				 return kRNA;
+		 else
+				 return kDNA;
+}
+return kOtherSeq;
+#else
+for (saw = 0; *seq != '\0' && saw < 300; seq++)
+{
+c = sre_toupper((int) *seq);
+if (! isgap(c))
+	{
+	  if (strchr(protonly, c)) po++;
+	  else if (strchr(primenuc,c)) {
+	    na++;
+	    if (c == 'T') nt++;
+	    else if (c == 'U') nu++;
+	  }
+	  else if (strchr(aminos,c)) aa++;
+	  else if (isalpha((int) c)) no++;
+	  saw++;
+	}
+}
+if (no > 0) return kOtherSeq;
+else if (po > 0) return kAmino;
+else if (na > aa) {
+if (nu > nt) return kRNA;
+else return kDNA;
+}
+else return kAmino;		/* ooooh. risky. */
+#endif
+}
+/* Function: GuessAlignmentSeqtype()
+* Date:     SRE, Wed Jul  7 09:42:34 1999 [St. Louis]
+*
+* Purpose:  Try to guess whether an alignment is protein
+*           or nucleic acid; return a code for the
+*           type (kRNA, kDNA, or kAmino).
+*
+* Args:     aseq  - array of aligned sequences. (Could also
+*                   be an rseq unaligned sequence array)
+*           nseq  - number of aseqs
+*
+* Returns:  kRNA, kDNA, kAmino;
+*           kOtherSeq if inconsistency is detected.
+*/
+int
+GuessAlignmentSeqtype(char **aseq, int nseq)
+{
+int idx;
+int nrna   = 0;
+int ndna   = 0;
+int namino = 0;
+int nother = 0;
+for (idx = 0; idx < nseq; idx++)
+switch (Seqtype(aseq[idx])) {
+case kRNA:   nrna++;   break;
+case kDNA:   ndna++;   break;
+case kAmino: namino++; break;
+default:     nother++;
+}
+/* Unambiguous decisions:
+*/
+if (nother)         return kOtherSeq;
+if (namino == nseq) return kAmino;
+if (ndna   == nseq) return kDNA;
+if (nrna   == nseq) return kRNA;
+/* Ambiguous decisions:
+*/
+if (namino == 0)    return kRNA; /* it's nucleic acid, but seems mixed RNA/DNA */
+return kAmino;		   /* some amino acid seen; others probably short seqs, some
+				      of which may be entirely ACGT (ala,cys,gly,thr). We
+				      could be a little more sophisticated: U would be a giveaway
+				      that we're not in protein seqs */
+}
+/* Function: WriteSimpleFASTA()
+* Date:     SRE, Tue Nov 16 18:06:00 1999 [St. Louis]
+*
+* Purpose:  Just write a FASTA format sequence to a file;
+*           minimal interface, mostly for quick and dirty programs.
+*
+* Args:     fp   - open file handle (stdout, possibly)
+*           seq  - sequence to output
+*           name - name for the sequence
+*           desc - optional description line, or NULL.
+*
+* Returns:  void
+*/
+void
+WriteSimpleFASTA(FILE *fp, char *seq, char *name, char *desc)
+{
+char buf[61];
+int  len;
+int  pos;
+len = strlen(seq);
+buf[60] = '\0';
+fprintf(fp, ">%s %s\n", name, desc != NULL ? desc : "");
+for (pos = 0; pos < len; pos += 60)
+{
+strncpy(buf, seq+pos, 60);
+fprintf(fp, "%s\n", buf);
+}
+}
+int
+WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
+{
+int   numline = 0;
+int   lines = 0, spacer = 0, width  = 50, tab = 0;
+int   i, j, l, l1, ibase;
+char  endstr[10];
+char  s[100];			/* buffer for sequence  */
+char  ss[100];		/* buffer for structure */
+int   checksum = 0;
+int   seqlen;
+int   which_case;    /* 0 = do nothing. 1 = upper case. 2 = lower case */
+int   dostruc;		/* TRUE to print structure lines*/
+which_case = 0;
+dostruc    = FALSE;
+seqlen     = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq);
+if (IsAlignmentFormat(outform))
+Die("Tried to write an aligned format with WriteSeq() -- bad, bad.");
+strcpy( endstr,"");
+l1 = 0;
+checksum = GCGchecksum(seq, seqlen);
+switch (outform) {
+case SQFILE_UNKNOWN:    /* no header, just sequence */
+strcpy(endstr,"\n"); /* end w/ extra blank line */
+break;
+case SQFILE_GENBANK:
+fprintf(outf,"LOCUS       %s       %d bp\n",
+	    sqinfo->name, seqlen);
+fprintf(outf,"ACCESSION   %s\n",
+	    (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : ".");
+fprintf(outf,"DEFINITION  %s\n",
+	    (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ".");
+fprintf(outf,"VERSION     %s\n",
+	    (sqinfo->flags & SQINFO_ID) ? sqinfo->id : ".");
+fprintf(outf,"ORIGIN      \n");
+spacer = 11;
+numline = 1;
+strcpy(endstr, "\n//");
+break;
+case SQFILE_GCGDATA:
+fprintf(outf, ">>>>%s  9/95  ASCII  Len: %d\n", sqinfo->name, seqlen);
+fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
+break;
+case SQFILE_PIR:
+fprintf(outf, "ENTRY          %s\n",
+	    (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
+fprintf(outf, "TITLE          %s\n",
+	    (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
+fprintf(outf, "ACCESSION      %s\n",
+	    (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
+fprintf(outf, "SUMMARY                                #Length %d  #Checksum  %d\n",
+	    sqinfo->len, checksum);
+fprintf(outf, "SEQUENCE\n");
+fprintf(outf, "                  5        10        15        20        25        30\n");
+spacer  = 2;		/* spaces after every residue */
+numline = 1;              /* number lines w/ coords     */
+width   = 30;             /* 30 aa per line             */
+strcpy(endstr, "\n///");
+break;
+case SQFILE_SQUID:
+fprintf(outf, "NAM  %s\n", sqinfo->name);
+if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
+fprintf(outf, "SRC  %s %s %d..%d::%d\n",
+	      (sqinfo->flags & SQINFO_ID)    ? sqinfo->id     : "-",
+	      (sqinfo->flags & SQINFO_ACC)   ? sqinfo->acc    : "-",
+	      (sqinfo->flags & SQINFO_START) ? sqinfo->start  : 0,
+	      (sqinfo->flags & SQINFO_STOP)  ? sqinfo->stop   : 0,
+	      (sqinfo->flags & SQINFO_OLEN)  ? sqinfo->olen   : 0);
+if (sqinfo->flags & SQINFO_DESC)
+fprintf(outf, "DES  %s\n", sqinfo->desc);
+if (sqinfo->flags & SQINFO_SS)
+{
+	fprintf(outf, "SEQ  +SS\n");
+	dostruc = TRUE;	/* print structure lines too */
+}
+else
+fprintf(outf, "SEQ\n");
+numline = 1;                /* number seq lines w/ coords  */
+strcpy(endstr, "\n++");
+break;
+case SQFILE_EMBL:
+fprintf(outf,"ID   %s\n",
+	    (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
+fprintf(outf,"AC   %s\n",
+	    (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
+fprintf(outf,"DE   %s\n",
+	    (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
+fprintf(outf,"SQ             %d BP\n", seqlen);
+strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
+tab = 5;     /** added 31jan91 */
+spacer = 11; /** added 31jan91 */
+break;
+case SQFILE_GCG:
+fprintf(outf,"%s\n", sqinfo->name);
+if (sqinfo->flags & SQINFO_ACC)
+fprintf(outf,"ACCESSION   %s\n", sqinfo->acc);
+if (sqinfo->flags & SQINFO_DESC)
+fprintf(outf,"DEFINITION  %s\n", sqinfo->desc);
+fprintf(outf,"    %s  Length: %d  (today)  Check: %d  ..\n",
+	    sqinfo->name, seqlen, checksum);
+spacer = 11;
+numline = 1;
+strcpy(endstr, "\n");  /* this is insurance to help prevent misreads at eof */
+break;
+case SQFILE_STRIDER: /* ?? map ?*/
+fprintf(outf,"; ### from DNA Strider ;-)\n");
+fprintf(outf,"; DNA sequence  %s, %d bases, %d checksum.\n;\n",
+	    sqinfo->name, seqlen, checksum);
+strcpy(endstr, "\n//");
+break;
+/* SRE: Don had Zuker default to Pearson, which is not
+			   intuitive or helpful, since Zuker's MFOLD can't read
+			   Pearson format. More useful to use kIG */
+case SQFILE_ZUKER:
+which_case = 1;			/* MFOLD requires upper case. */
+/*FALLTHRU*/
+case SQFILE_IG:
+fprintf(outf,";%s %s\n",
+	    sqinfo->name,
+	    (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
+fprintf(outf,"%s\n", sqinfo->name);
+strcpy(endstr,"1"); /* == linear dna */
+break;
+case SQFILE_RAW:			/* Raw: no header at all. */
+break;
+default :
+case SQFILE_FASTA:
+fprintf(outf,">%s %s\n", sqinfo->name,
+	    (sqinfo->flags & SQINFO_DESC)  ? sqinfo->desc   : "");
+break;
+}
+if (which_case == 1) s2upper(seq);
+if (which_case == 2) s2lower(seq);
+width = MIN(width,100);
+for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) {
+if (l1 < 0) l1 = 0;
+else if (l1 == 0) {
+if (numline) fprintf(outf,"%8d ",ibase);
+for (j=0; j<tab; j++) fputc(' ',outf);
+}
+if ((spacer != 0) && ((l+1) % spacer == 1))
+{ s[l] = ' '; ss[l] = ' '; l++; }
+s[l]  = seq[i];
+ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
+l++; i++;
+l1++;                 /* don't count spaces for width*/
+if (l1 == width || i == seqlen) {
+s[l] = ss[l] = '\0';
+l = 0; l1 = 0;
+if (dostruc)
+	{
+	  fprintf(outf, "%s\n", s);
+	  if (numline) fprintf(outf,"         ");
+	  for (j=0; j<tab; j++) fputc(' ',outf);
+	  if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr);
+	  else fprintf(outf,"%s\n",ss);
+	}
+else
+	{
+	  if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr);
+	  else fprintf(outf,"%s\n",s);
+	}
+lines++;
+ibase = i+1;
+}
+}
+return lines;
+}
+/* Function: ReadMultipleRseqs()
+*
+* Purpose:  Open a data file and
+*           parse it into an array of rseqs (raw, unaligned
+*           sequences).
+*
+*           Caller is responsible for free'ing memory allocated
+*           to ret_rseqs, ret_weights, and ret_names.
+*
+*           Weights are currently only supported for MSF format.
+*           Sequences read from all other formats will be assigned
+*           weights of 1.0. If the caller isn't interested in
+*           weights, it passes NULL as ret_weights.
+*
+* Returns 1 on success. Returns 0 on failure and sets
+* squid_errno to indicate the cause.
+*/
+int
+ReadMultipleRseqs(char              *seqfile,
+		  int                fformat,
+		  char            ***ret_rseqs,
+		  SQINFO **ret_sqinfo,
+		  int               *ret_num)
+{
+SQINFO *sqinfo;               /* array of sequence optional info         */
+SQFILE *dbfp;                 /* open ptr for sequential access of file  */
+char  **rseqs;                /* sequence array                          */
+int     numalloced;           /* num of seqs currently alloced for       */
+int     num;
+num        = 0;
+numalloced = 16;
+rseqs  = (char **) MallocOrDie (numalloced * sizeof(char *));
+sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
+if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;
+while (ReadSeq(dbfp, dbfp->format, &rseqs[num], &(sqinfo[num])))
+{
+num++;
+if (num == numalloced) /* more seqs coming, alloc more room */
+	{
+	  numalloced += 16;
+	  rseqs  = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
+	  sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
+	}
+}
+SeqfileClose(dbfp);
+*ret_rseqs  = rseqs;
+*ret_sqinfo = sqinfo;
+*ret_num    = num;
+return 1;
+}
+/* Function: String2SeqfileFormat()
+* Date:     SRE, Sun Jun 27 15:25:54 1999 [TW 723 over Canadian Shield]
+*
+* Purpose:  Convert a string (e.g. from command line option arg)
+*           to a format code. Case insensitive. Return
+*           MSAFILE_UNKNOWN/SQFILE_UNKNOWN if string is bad.
+*           Uses codes defined in squid.h (unaligned formats) and
+*           msa.h (aligned formats).
+*
+* Args:     s   - string to convert; e.g. "stockholm"
+*
+* Returns:  format code; e.g. MSAFILE_STOCKHOLM
+*/
+int
+String2SeqfileFormat(char *s)
+{
+char *s2;
+int   code = SQFILE_UNKNOWN;
+if (s == NULL) return SQFILE_UNKNOWN;
+s2 = sre_strdup(s, -1);
+s2upper(s2);
+if      (strcmp(s2, "FASTA")     == 0) code = SQFILE_FASTA;
+#ifdef CLUSTALO
+if      (strcmp(s2, "FA")        == 0) code = SQFILE_FASTA;
+else if (strcmp(s2, "VIENNA")    == 0) code = SQFILE_VIENNA;
+else if (strcmp(s2, "VIE")       == 0) code = SQFILE_VIENNA;
+#endif
+else if (strcmp(s2, "GENBANK")   == 0) code = SQFILE_GENBANK;
+#ifdef CLUSTALO
+else if (strcmp(s2, "GB")   == 0) code = SQFILE_GENBANK;
+#endif
+else if (strcmp(s2, "EMBL")      == 0) code = SQFILE_EMBL;
+else if (strcmp(s2, "GCG")       == 0) code = SQFILE_GCG;
+else if (strcmp(s2, "GCGDATA")   == 0) code = SQFILE_GCGDATA;
+else if (strcmp(s2, "RAW")       == 0) code = SQFILE_RAW;
+else if (strcmp(s2, "IG")        == 0) code = SQFILE_IG;
+else if (strcmp(s2, "STRIDER")   == 0) code = SQFILE_STRIDER;
+else if (strcmp(s2, "IDRAW")     == 0) code = SQFILE_IDRAW;
+else if (strcmp(s2, "ZUKER")     == 0) code = SQFILE_ZUKER;
+else if (strcmp(s2, "PIR")       == 0) code = SQFILE_PIR;
+else if (strcmp(s2, "SQUID")     == 0) code = SQFILE_SQUID;
+else if (strcmp(s2, "STOCKHOLM") == 0) code = MSAFILE_STOCKHOLM;
+#ifdef CLUSTALO
+else if (strcmp(s2, "ST") == 0) code = MSAFILE_STOCKHOLM;
+else if (strcmp(s2, "STK") == 0) code = MSAFILE_STOCKHOLM;
+#endif
+else if (strcmp(s2, "SELEX")     == 0) code = MSAFILE_SELEX;
+else if (strcmp(s2, "MSF")       == 0) code = MSAFILE_MSF;
+else if (strcmp(s2, "CLUSTAL")   == 0) code = MSAFILE_CLUSTAL;
+#ifdef CLUSTALO
+else if (strcmp(s2, "CLU")   == 0) code = MSAFILE_CLUSTAL;
+#endif
+else if (strcmp(s2, "A2M")       == 0) code = MSAFILE_A2M;
+else if (strcmp(s2, "PHYLIP")    == 0) code = MSAFILE_PHYLIP;
+#ifdef CLUSTALO
+else if (strcmp(s2, "PHY")    == 0) code = MSAFILE_PHYLIP;
+#endif
+else if (strcmp(s2, "EPS")       == 0) code = MSAFILE_EPS;
+#ifdef CLUSTALO
+else code = SQFILE_UNKNOWN;
+#endif
+free(s2);
+return code;
+}
+char *
+SeqfileFormat2String(int code)
+{
+switch (code) {
+case SQFILE_UNKNOWN:   return "unknown";
+case SQFILE_FASTA:     return "FASTA";
+#ifdef CLUSTALO
+case SQFILE_VIENNA:    return "Vienna";
+#endif
+case SQFILE_GENBANK:   return "Genbank";
+case SQFILE_EMBL:      return "EMBL";
+case SQFILE_GCG:       return "GCG";
+case SQFILE_GCGDATA:   return "GCG data library";
+case SQFILE_RAW:       return "raw";
+case SQFILE_IG:        return "Intelligenetics";
+case SQFILE_STRIDER:   return "MacStrider";
+case SQFILE_IDRAW:     return "Idraw Postscript";
+case SQFILE_ZUKER:     return "Zuker";
+case SQFILE_PIR:       return "PIR";
+case SQFILE_SQUID:     return "SQUID";
+case MSAFILE_STOCKHOLM: return "Stockholm";
+case MSAFILE_SELEX:     return "SELEX";
+case MSAFILE_MSF:       return "MSF";
+case MSAFILE_CLUSTAL:   return "Clustal";
+case MSAFILE_A2M:       return "a2m";
+case MSAFILE_PHYLIP:    return "Phylip";
+case MSAFILE_EPS:       return "EPS";
+default:
+Die("Bad code passed to MSAFormat2String()");
+}
+/*NOTREACHED*/
+return NULL;
+}
+/* Function: MSAToSqinfo()
+* Date:     SRE, Tue Jul 20 14:36:56 1999 [St. Louis]
+*
+* Purpose:  Take an MSA and generate a SQINFO array suitable
+*           for use in annotating the unaligned sequences.
+*           Return the array.
+*
+*           Permanent temporary code. sqinfo was poorly designed.
+*           it must eventually be replaced, but the odds
+*           of this happening soon are nil, so I have to deal.
+*
+* Args:     msa   - the alignment
+*
+* Returns:  ptr to allocated sqinfo array.
+*           Freeing is ghastly: free in each individual sqinfo[i]
+*           with FreeSequence(NULL, &(sqinfo[i])), then
+*           free(sqinfo).
+*/
+SQINFO *
+MSAToSqinfo(MSA *msa)
+{
+int     idx;
+SQINFO *sqinfo;
+sqinfo = MallocOrDie(sizeof(SQINFO) * msa->nseq);
+for (idx = 0; idx < msa->nseq; idx++)
+{
+sqinfo[idx].flags = 0;
+SetSeqinfoString(&(sqinfo[idx]),
+		       msa->sqname[idx],                 SQINFO_NAME);
+SetSeqinfoString(&(sqinfo[idx]),
+		       MSAGetSeqAccession(msa, idx),     SQINFO_ACC);
+SetSeqinfoString(&(sqinfo[idx]),
+		       MSAGetSeqDescription(msa, idx),   SQINFO_DESC);
+if (msa->ss != NULL && msa->ss[idx] != NULL) {
+	MakeDealignedString(msa->aseq[idx], msa->alen,
+			    msa->ss[idx], &(sqinfo[idx].ss));
+	sqinfo[idx].flags |= SQINFO_SS;
+}
+if (msa->sa != NULL && msa->sa[idx] != NULL) {
+	MakeDealignedString(msa->aseq[idx], msa->alen,
+			    msa->sa[idx], &(sqinfo[idx].sa));
+	sqinfo[idx].flags |= SQINFO_SA;
+}
+sqinfo[idx].len    = DealignedLength(msa->aseq[idx]);
+sqinfo[idx].flags |= SQINFO_LEN;
+}
+return sqinfo;
+}
+/* cc -o sqio_test -DA_QUIET_DAY -L. sqio.c -lsquid */
+#ifdef A_QUIET_DAY
+#include "ssi.h"
+int
+main(int argc, char **argv)
+{
+FILE *fp;
+char *filename;
+char *buf;
+int   len;
+int   mode = 3;
+SSIOFFSET off;
+filename = argv[1];
+if (mode == 1) {
+buf = malloc(sizeof(char) * 256);
+if ((fp = fopen(filename, "r")) == NULL)
+Die("open of %s failed", filename);
+while (fgets(buf, 255, fp) != NULL)
+;
+fclose(fp);
+free(buf);
+} else if (mode == 2) {
+if ((fp = fopen(filename, "r")) == NULL)
+Die("open of %s failed", filename);
+buf = NULL; len = 0;
+while (sre_fgets(&buf, &len, fp) != NULL)
+SSIGetFilePosition(fp, SSI_OFFSET_I32, &off);
+fclose(fp);
+free(buf);
+} else if (mode == 3) {
+SQFILE *dbfp;
+SQINFO  info;
+if ((dbfp = SeqfileOpen(filename, SQFILE_FASTA, NULL)) == NULL)
+Die("open of %s failed", filename);
+while (ReadSeq(dbfp, dbfp->format, &buf, &info)) {
+SSIGetFilePosition(dbfp->f, SSI_OFFSET_I32, &off);
+FreeSequence(buf, &info);
+}
+SeqfileClose(dbfp);
+}
+}
+#endif

Mercurial > repos > clustalomega > clustalomega

comparison clustalomega/clustal-omega-1.0.2/src/squid/sqio.c @ 1:bc707542e5de