Mercurial > repos > dawe > srf2fastq
diff srf2fastq/io_lib-1.12.2/io_lib/open_trace_file.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srf2fastq/io_lib-1.12.2/io_lib/open_trace_file.c Tue Jun 07 17:48:05 2011 -0400 @@ -0,0 +1,1151 @@ +#ifdef HAVE_CONFIG_H +# include "io_lib_config.h" +#endif + +#if !(defined(_MSC_VER) || defined(__MINGW32__)) +# define TRACE_ARCHIVE +# ifndef HAVE_LIBCURL +# define USE_WGET +# endif +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> +#include <limits.h> +#include "io_lib/os.h" +#ifdef TRACE_ARCHIVE +# include <sys/socket.h> +# include <netinet/in.h> +# include <netdb.h> +# include <sys/time.h> +# include <errno.h> +#endif +#ifdef USE_WGET +# include <sys/wait.h> +#endif +#ifndef PATH_MAX +# define PATH_MAX 1024 +#endif +#ifdef HAVE_LIBCURL +# include <curl/curl.h> +#endif + +#include "io_lib/open_trace_file.h" +#include "io_lib/misc.h" +#include "io_lib/tar_format.h" +#include "io_lib/compress.h" +#include "io_lib/hash_table.h" +#include "io_lib/sff.h" +#include "io_lib/srf.h" + +/* + * Supported compression extensions. See the magics array in compress.c for + * the full structure. + */ +static char *magics[] = {"", ".bz", ".gz", ".Z", ".z", ".bz2", ".sz"}; + +/* + * Tokenises the search path splitting on colons (unix) or semicolons (windows). + * We also explicitly add a "./" to the end of the search path + * + * Returns: A new search path with items separated by nul chars. Two nul + * chars in a row represent the end of the tokenised path. + * Returns NULL for a failure. + * + * The returned data has been malloced. It is up to the caller to free this + * memory. + */ +static char *tokenise_search_path(char *searchpath) { + char *newsearch; + unsigned int i, j; + size_t len; +#ifdef _WIN32 + char path_sep = ';'; +#else + char path_sep = ':'; +#endif + + if (!searchpath) + searchpath=""; + + newsearch = (char *)malloc((len = strlen(searchpath))+5); + if (!newsearch) + return NULL; + + for (i = 0, j = 0; i < len; i++) { + /* "::" => ":". Used for escaping colons in http://foo */ + if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') { + newsearch[j++] = ':'; + i++; + continue; + } + + if (searchpath[i] == path_sep) { + /* Skip blank path components */ + if (j && newsearch[j-1] != 0) + newsearch[j++] = 0; + } else { + newsearch[j++] = searchpath[i]; + } + } + + if (j) + newsearch[j++] = 0; + newsearch[j++] = '.'; + newsearch[j++] = '/'; + newsearch[j++] = 0; + newsearch[j++] = 0; + + return newsearch; +} + +/* + * Searches for file in the tar pointed to by tarname. If it finds it, it + * copies it out and returns a file pointer to the temporary file, + * otherwise we return NULL. + * + * If 'tarname'.index exists we will use this as a fast lookup method, + * otherwise we just do a sequential search through the tar. + * + * Offset specifies a starting search position. Set this to zero if you want + * to search through the entire tar file, otherwise set it to the byte offset + * into the file of the tar header block for the desired file to extract. + * (Note that the tar index file overrides this value.) + * + * Returns mFILE pointer if found + * NULL if not. + */ +static mFILE *find_file_tar(char *file, char *tarname, size_t offset) { + int num_magics = sizeof(magics) / sizeof(*magics); + char path[PATH_MAX+101]; + FILE *fp; + tar_block blk; + int size; + int name_len = strlen(file); + + /* Maximum name length for a tar file */ + if (name_len > 100) + return NULL; + + /* Search the .index file */ + sprintf(path, "%s.index", tarname); + if (file_exists(path)) { + FILE *fpind = fopen(path, "r"); + char *cp; + int tmp_off; + int found = 0; + + if (fpind) { + while (fgets(path, PATH_MAX+100, fpind)) { + if (cp = strchr(path, '\n')) + *cp = 0; + tmp_off = strtol(path, &cp, 10); + while (isspace(*cp)) + cp++; + if (strncmp(cp, file, name_len) == 0) { + int i; + for (i = 0; i < num_magics; i++) { + if (strcmp(&cp[name_len], magics[i]) == 0) { + offset = tmp_off; + found = 1; + break; + } + } + if (found) + break; + } + } + fclose(fpind); + + /* Not in index */ + if (!found) + return NULL; + } + } + + if (NULL == (fp = fopen(tarname, "rb"))) + return NULL; + + /* + * Search through the tar file (starting from index position) looking + * for our filename. If there was no index then we start from position 0. + */ + fseek(fp, offset, SEEK_SET); + while(fread(&blk, sizeof(blk), 1, fp) == 1) { + if (!blk.header.name[0]) + break; + + size = strtol(blk.header.size, NULL, 8); + + /* start with the same name... */ + if (strncmp(blk.header.name, file, name_len) == 0) { + char *data; + int i; + + /* ... but does it end with a known compression extension? */ + for (i = 0; i < num_magics; i++) { + if (strcmp(&blk.header.name[name_len], magics[i]) == 0) { + break; + } + } + /* ... apparently not? continue then */ + if (i == num_magics) + continue; + + /* Found it - copy out the data to an mFILE */ + if (NULL == (data = (char *)malloc(size))) + return NULL; + if (size != fread(data, 1, size, fp)) { + free(data); + return NULL; + } + return mfcreate(data, size); + } + + fseek(fp, TBLOCK*((size+TBLOCK-1)/TBLOCK), SEEK_CUR); + } + + fclose(fp); + return NULL; +} + +/* + * Reads a hash file to look for a filename. The hash file contains the + * (relative) pathname for the file it is an index for along with the + * positions and sizes of each file contained within it. The file format + * of the archive itself is irrelevant provided that the data is not + * internally compressed in some manner specific to that archive. + * + * Return mFILE pointer if found + * NULL if not + */ +static mFILE *find_file_hash(char *file, char *hashfile) { + size_t size; + static HashFile *hf = NULL; + static char hf_name[1024]; + char *data; + + /* Cache an open HashFile for fast accesing */ + if (strcmp(hashfile, hf_name) != 0) { + if (hf) + HashFileDestroy(hf); + hf = HashFileOpen(hashfile); + + if (!hf) + return NULL; + strcpy(hf_name, hashfile); + } + + /* Search */ + if (NULL == (data = HashFileExtract(hf, file, &size))) + return NULL; + + /* Found, so copy the contents to a fake FILE pointer */ + return mfcreate(data, size); +} + +/* + * Extracts a single trace from an SRF file. + * + * Return mFILE pointer if found + * NULL if not + */ +static mFILE *find_file_srf(char *tname, char *srffile) { + srf_t *srf; + uint64_t cpos, hpos, dpos; + mFILE *mf = NULL; + char *cp; + + if (NULL == (srf = srf_open(srffile, "r"))) + return NULL; + + if (NULL != (cp = strrchr(tname, '/'))) + tname = cp+1; + + if (0 == srf_find_trace(srf, tname, &cpos, &hpos, &dpos)) { + char *data = malloc(srf->th.trace_hdr_size + srf->tb.trace_size); + if (!data) { + srf_destroy(srf, 1); + return NULL; + } + memcpy(data, srf->th.trace_hdr, srf->th.trace_hdr_size); + memcpy(data + srf->th.trace_hdr_size, + srf->tb.trace, srf->tb.trace_size); + mf = mfcreate(data, srf->th.trace_hdr_size + srf->tb.trace_size); + } + + srf_destroy(srf, 1); + return mf; +} + +#ifdef TRACE_ARCHIVE +/* + * Searches for file in the ensembl trace archive pointed to by arcname. + * If it finds it, it copies it out and returns a file pointer to the + * temporary file, otherwise we return NULL. + * + * Arcname has the form address:port, eg "titan/22100" + * + * Returns mFILE pointer if found + * NULL if not. + */ +#define RDBUFSZ 8192 +static mFILE *find_file_archive(char *file, char *arcname) { + char server[1024], *cp; + int port; + struct hostent *host; + struct sockaddr_in saddr; + int s = 0; + char msg[1024]; + ssize_t msg_len; + char buf[RDBUFSZ]; + mFILE *fpout; + int block_count; + + /* Split arc name into server and port */ + if (!(cp = strchr(arcname, '/'))) + return NULL; + strncpy(server, arcname, 1023); + server[MIN(1023,cp-arcname)] = 0; + port = atoi(cp+1); + + /* Make and connect socket */ + if (NULL == (host = gethostbyname(server))) { + perror("gethostbyname()"); + return NULL; + } + saddr.sin_port = htons(port); + saddr.sin_family = host->h_addrtype; + memcpy(&saddr.sin_addr,host->h_addr_list[0], host->h_length); + if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) { + perror("socket()"); + return NULL; + } + if (connect(s, (struct sockaddr *)&saddr, sizeof(saddr)) == -1) { + perror("connect()"); + return NULL; + } + + /* The minimal message to send down is "--scf tracename" */ + sprintf(msg, "--scf %.*s\n", 1000, file); + msg_len = strlen(msg); + if (send(s, msg, msg_len, 0) != msg_len) { + /* + * partial request sent, but requests are short so if this + * happens it's unlikely we'll cure it by sending multiple + * fragments. + */ + /* close(s); */ + return NULL; + } + + /* + * Create a fake FILE (mFILE) and write to it. + */ + fpout = mfcreate(NULL, 0); + + /* + * Read the data back, in multiple blocks if necessary and write it + * to our temporary file. We use a blocking read with a low timeout to + * prevent locking up the application indefinitely. + */ + { + struct timeval tv = {0, 10000}; + setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)); + } + errno = 0; + block_count = 200; + while ((msg_len = read(s, buf, RDBUFSZ)) > 0 || + (errno == EWOULDBLOCK && --block_count)) { + errno = 0; + if (msg_len > 0) + mfwrite(buf, 1, msg_len, fpout); + } + close(s); + + if (!block_count) { + mfclose(fpout); + return NULL; + } + + mrewind(fpout); + + return fpout; +} +#endif + +#ifdef USE_WGET +/* NB: non-reentrant due to reuse of handle */ +static mFILE *find_file_url(char *file, char *url) { + char buf[8192], *cp; + mFILE *fp; + int pid; + int maxlen = 8190 - strlen(file); + char *fname = tempnam(NULL, NULL); + int status; + + /* Expand %s for the trace name */ + for (cp = buf; *url && cp - buf < maxlen; url++) { + if (*url == '%' && *(url+1) == 's') { + url++; + cp += strlen(strcpy(cp, file)); + } else { + *cp++ = *url; + } + } + *cp++ = 0; + + /* Execute wget */ + if ((pid = fork())) { + waitpid(pid, &status, 0); + } else { + execlp("wget", "wget", "-q", "-O", fname, buf, NULL); + } + + /* Return a filepointer to the result (if it exists) */ + fp = (!status && file_size(fname) != 0) ? mfopen(fname, "rb") : NULL; + remove(fname); + free(fname); + + return fp; +} +#endif + +#ifdef HAVE_LIBCURL +static mFILE *find_file_url(char *file, char *url) { + char buf[8192], *cp; + mFILE *mf = NULL, *headers = NULL; + int maxlen = 8190 - strlen(file); + static CURL *handle = NULL; + static int curl_init = 0; + char errbuf[CURL_ERROR_SIZE]; + + *errbuf = 0; + + if (!curl_init) { + if (curl_global_init(CURL_GLOBAL_ALL)) + return NULL; + + if (NULL == (handle = curl_easy_init())) + goto error; + + curl_init = 1; + } + + /* Expand %s for the trace name */ + for (cp = buf; *url && cp - buf < maxlen; url++) { + if (*url == '%' && *(url+1) == 's') { + url++; + cp += strlen(strcpy(cp, file)); + } else { + *cp++ = *url; + } + } + *cp++ = 0; + + /* Setup the curl */ + if (NULL == (mf = mfcreate(NULL, 0)) || + NULL == (headers = mfcreate(NULL, 0))) + return NULL; + + if (0 != curl_easy_setopt(handle, CURLOPT_URL, buf)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_TIMEOUT, 10L)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, mfwrite)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_WRITEDATA, mf)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_HEADERFUNCTION, mfwrite)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_WRITEHEADER, headers)) + goto error; + if (0 != curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf)) + goto error; + + /* Fetch! */ + if (0 != curl_easy_perform(handle)) + goto error; + + /* Report errors is approproate. 404 is silent as it may have just been + * a search via RAWDATA path, everything else is worth reporting. + */ + { + float version; + int response; + char nul = 0; + mfwrite(&nul, 1, 1, headers); + if (2 == sscanf(headers->data, "HTTP/%f %d", &version, &response)) { + if (response != 200) { + if (response != 404) + fprintf(stderr, "%.*s\n", + (int)headers->size, headers->data); + goto error; + } + } + } + + if (mftell(mf) == 0) + goto error; + + mfdestroy(headers); + + mrewind(mf); + return mf; + + error: + if (mf) + mfdestroy(mf); + if (headers) + mfdestroy(headers); + if (*errbuf) + fprintf(stderr, "%s\n", errbuf); + return NULL; +} +#endif + +/* + * Takes an SFF file in 'data' and edits the header to ensure + * that it has no index listed and only claims to contain a single entry. + * This isn't strictly necessary for the sff/sff.c reading code, but it is + * the 'Right Thing' to do. + * + * Returns an mFILE on success or NULL on failure. + */ +static mFILE *sff_single(char *data, size_t size) { + *(uint64_t *)(data+8) = be_int8(0); /* index offset */ + *(uint32_t *)(data+16) = be_int4(0); /* index size */ + *(uint32_t *)(data+20) = be_int4(1); /* number of reads */ + + return mfcreate(data, size); +} + +/* Hash (.hsh) format index searching for SFF files */ +static mFILE *sff_hash_query(char *sff, char *entry, FILE *fp) { + static HashFile *hf = NULL; + static char sff_copy[1024]; + static FILE *fp_copy = NULL; + char *data; + size_t size; + + /* Cache an open HashFile for fast accessing */ + if (strcmp(sff, sff_copy) != 0) { + if (hf) { + hf->afp = hf->hfp = NULL; /* will be closed by our parent */ + HashFileDestroy(hf); + } + fseek(fp, -4, SEEK_CUR); + if (NULL == (hf = HashFileFopen(fp))) + return NULL; + + strcpy(sff_copy, sff); + fp_copy = fp; + } + + data = HashFileExtract(hf, entry, &size); + + return data ? sff_single(data, size) : NULL; +} + + +/* + * getuint4_255 + * + * A function to convert a 4-byte TVF/SFF value into an integer, where + * the bytes are base 255 numbers. This is used to store the index offsets. + */ +static unsigned int getuint4_255(unsigned char *b) +{ + return + ((unsigned int) b[0]) * 255 * 255 * 255 + + ((unsigned int) b[1]) * 255 * 255 + + ((unsigned int) b[2]) * 255 + + ((unsigned int) b[3]); +} + +/* + * 454 sorted format (.srt) index searching for SFF files. + * Uses a binary search. + * This function and getuint4_255 above are taken with permission + * from 454's getsff.c with the following licence: + * + * Copyright (c)[2001-2005] 454 Life Sciences Corporation. All Rights Reserved. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * + * IN NO EVENT SHALL LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE. + * + * Permission to use, copy, modify and distribute this software and its + * documentation for any purpose is hereby granted without fee, provided + * that this copyright and notice appears in all copies. + */ +static mFILE *sff_sorted_query(char *sff, char *accno, FILE *fp, + uint32_t index_length) { + static unsigned char *index; + static char sff_copy[1024]; + unsigned char *us; + uint32_t start, end; + uint32_t offset; + char *data = NULL; + static char chdr[1024]; + static int chdrlen = 0, nflows = 0; + char rhdr[1024]; + int rhdrlen; + int nbases, dlen; + int bytes_per_flow = 2; + + /* Cache index if we're querying the same SFF file */ + if (strcmp(sff_copy, sff) != 0) { + if (index) + xfree(index); + if (NULL == (index = (unsigned char *)xmalloc(index_length))) + return NULL; + + if (index_length != fread(index, 1, index_length, fp)) { + xfree(index); + return NULL; + } + strcpy(sff_copy, sff); + + /* Read the common header too - minimal decoding necessary */ + fseek(fp, 0, SEEK_SET); + if (31 != fread(chdr, 1, 31, fp)) + return NULL; + chdrlen = be_int2(*(uint16_t *)(chdr+24)); + nflows = be_int2(*(uint16_t *)(chdr+28)); + if (chdrlen-31 != fread(chdr+31, 1, chdrlen-31, fp)) + return NULL; + } + + /* + * Perform a binary search of the index, stopping when the search + * region becomes relatively small. This assumes that no accession + * number is near 200 characters. + */ + start = 0; + end = index_length; + while (end - start > 200) { + uint32_t mid; + int val; + mid = (start + end) / 2; + + /* + * From the byte midpoint, scan backwards to the beginning of the + * index record that covers that byte midpoint. + */ + while (mid > start && index[mid-1] != 255) { + mid--; + } + val = strcmp(accno, (char *)(index+mid)); + + if (val == 0) { + break; + } else if (val < 0) { + end = mid; + } else { + start = mid; + } + } + + /* + * Scan through the small search region, looking for the accno. + */ + while (start < end) { + if (strcmp(accno, (char *)(index+start)) == 0) { + /* + * If the accno is found, skip the accno characters, + * then get the record offset. + */ + for (us=index+start; *us; us++,start++) ; + us++; + start++; + + offset = getuint4_255(us); + if (us[4] != 255) { + return NULL; + } + + /* + * The original getsff.c here computed the record size by + * looking at the next index item and comparing it's offset to + * this one, or the end of file position if this is the last + * item. This has two problems: + * 1: It means the index itself cannot be added to the end of + * the file. + * 2: It means that we cannot simply add an index to a SFF + * file without also reordering all of the items within it. + * + * We solve this by reading the read header to work out the + * object size instead. + */ + break; + } + + /* + * Skip to the beginning of the next index element. + */ + while (start < end && index[start] != 255) { + start++; + } + start++; + } + + /* + * Now offset indicates the position of the SFF entry. Read and decode + * header to get data length. Then read this too. + */ + fseek(fp, offset, SEEK_SET); + if (16 != fread(rhdr, 1, 16, fp)) + return NULL; + + rhdrlen = be_int2(*(uint16_t *)rhdr); + nbases = be_int4(*(uint32_t *)(rhdr+4)); + + if (rhdrlen-16 != fread(rhdr+16, 1, rhdrlen-16, fp)) + return NULL; + dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7; + + /* Built up the fake SFF entry */ + if (NULL == (data = (char *)xmalloc(chdrlen + rhdrlen + dlen))) + return NULL; + + memcpy(data, chdr, chdrlen); + memcpy(data + chdrlen, rhdr, rhdrlen); + if (dlen != fread(data + chdrlen + rhdrlen, 1, dlen, fp)) { + xfree(data); + return NULL; + } + + /* Convert to mFILE */ + return sff_single(data, chdrlen + rhdrlen + dlen); +} + + +/* + * This returns an mFILE containing an SFF entry. + * + * This does the minimal decoding necessary to skip through the SFF + * container to find an entry. In this respect it is a semi-duplication + * of sff/sff.[ch], but implemented for efficiency. + * + * Having found an entry it packs the common header, the read specific + * header and the read data into a single block of memory and returns this + * as an mFILE. In essence it produces a single-read SFF archive. This + * is then decoded by the normal sff parsing code representing a small + * amount of redundancy, but one which is swamped by the I/O time. + */ +static mFILE *find_file_sff(char *entry, char *sff) { + static FILE *fp = NULL; + static char sff_copy[1024]; + char chdr[65536], rhdr[65536]; /* generous, but worst case */ + uint32_t nkey, nflows, chdrlen, rhdrlen, dlen, magic; + uint64_t file_pos; + static uint64_t index_offset = 0; + static uint32_t index_length = 0; + static char index_format[8]; + uint32_t nreads, i; + size_t entry_len = strlen(entry); + int bytes_per_flow = 2; + char *fake_file; + + /* + * Check cached information so rapid queries to the same archive are + * fast. + * ASSUMPTION: we won't externally replace the sff file with another of + * the same name. + */ + if (strcmp(sff, sff_copy) == 0) { + if (memcmp(index_format, ".hsh1.00", 8) == 0) { + return sff_hash_query(sff, entry, fp); + } else if (memcmp(index_format, ".srt1.00", 8) == 0) { + return sff_sorted_query(sff, entry, fp, index_length-8); + } + } + + if (fp) + fclose(fp); + + strcpy(sff_copy, sff); + *index_format = 0; + + + /* Read the common header */ + if (NULL == (fp = fopen(sff, "rb"))) + return NULL; + if (31 != fread(chdr, 1, 31, fp)) + return NULL; + + /* Check magic & vers: TODO */ + magic = be_int4(*(uint32_t *)chdr); + if (magic != SFF_MAGIC) + return NULL; + if (memcmp(chdr+4, SFF_VERSION, 4) != 0) + return NULL; + + /* If we have an index, use it, otherwise search linearly */ + index_offset = be_int8(*(uint64_t *)(chdr+8)); + index_length = be_int4(*(uint32_t *)(chdr+16)); + if (index_length != 0) { + long orig_pos = ftell(fp); + fseek(fp, index_offset, SEEK_SET); + fread(index_format, 1, 8, fp); + + if (memcmp(index_format, ".hsh1.00", 8) == 0) { + /* HASH index v1.00 */ + return sff_hash_query(sff, entry, fp); + + } else if (memcmp(index_format, ".srt1.00", 8) == 0) { + /* 454 sorted v1.00 */ + return sff_sorted_query(sff, entry, fp, index_length-8); + } else { + /* Unknown index: revert back to a slow linear scan */ + fseek(fp, orig_pos, SEEK_SET); + } + } + + nreads = be_int4(*(uint32_t *)(chdr+20)); + chdrlen = be_int2(*(uint16_t *)(chdr+24)); + nkey = be_int2(*(uint16_t *)(chdr+26)); + nflows = be_int2(*(uint16_t *)(chdr+28)); + + /* Read the remainder of the header */ + if (chdrlen-31 != fread(chdr+31, 1, chdrlen-31, fp)) + return NULL; + + file_pos = chdrlen; + + /* Loop until we find the correct entry */ + for (i = 0; i < nreads; i++) { + uint16_t name_len; + uint32_t nbases; + + /* Index could be between common header and first read - skip */ + if (file_pos == index_offset) { + fseek(fp, index_length, SEEK_CUR); + file_pos += index_length; + } + + /* Read 16 bytes to get name length */ + if (16 != fread(rhdr, 1, 16, fp)) + return NULL; + rhdrlen = be_int2(*(uint16_t *)rhdr); + name_len = be_int2(*(uint16_t *)(rhdr+2)); + nbases = be_int4(*(uint32_t *)(rhdr+4)); + + /* Read the rest of the header */ + if (rhdrlen-16 != fread(rhdr+16, 1, rhdrlen-16, fp)) + return NULL; + + file_pos += rhdrlen; + + dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7; + + if (name_len == entry_len && 0 == memcmp(rhdr+16, entry, entry_len)) + break; + + /* This is not the read you are looking for... */ + fseek(fp, dlen, SEEK_CUR); + } + + if (i == nreads) { + /* Not found */ + return NULL; + } + + /* + * Although we've decoded some bits already, we take the more modular + * approach of packing the sections together and passing the entire + * data structure off as a single-read SFF file to be decoded fully + * by the sff reading code. + */ + if (NULL == (fake_file = (char *)xmalloc(chdrlen + rhdrlen + dlen))) + return NULL; + + memcpy(fake_file, chdr, chdrlen); + memcpy(fake_file+chdrlen, rhdr, rhdrlen); + if (dlen != fread(fake_file+chdrlen+rhdrlen, 1, dlen, fp)) { + xfree(fake_file); + return NULL; + } + + /* Convert to an mFILE and return */ + return sff_single(fake_file, chdrlen+rhdrlen+dlen); +} + +/* + * Searches for file in the directory 'dirname'. If it finds it, it opens + * it. This also searches for compressed versions of the file in dirname + * too. + * + * Returns mFILE pointer if found + * NULL if not + */ +static mFILE *find_file_dir(char *file, char *dirname) { + char path[PATH_MAX+1], path2[PATH_MAX+1]; + size_t len = strlen(dirname); + char *cp; + + if (dirname[len-1] == '/') + len--; + + /* Special case for "./" or absolute filenames */ + if (*file == '/' || (len==1 && *dirname == '.')) + sprintf(path, "%s", file); + else + sprintf(path, "%.*s/%s", (int)len, dirname, file); + + if (is_file(path)) { + return mfopen(path, "rb"); + } + + /* + * Given a pathname /a/b/c if a/b is a file and not a directory then + * we'd get an ENOTDIR error. Instead we assume that a/b is an archive + * and we attempt to work out what type by reading the first and last + * bits of the file. + */ + if (cp = strrchr(file, '/')) { + strcpy(path2, path); /* path contains / too as it's from file */ + *strrchr(path2, '/') = 0; + + if (is_file(path2)) { + /* Open the archive to test for magic numbers */ + char magic[8]; + FILE *fp; + enum archive_type_t { + NONE, HASH, TAR, SFF, SRF + } type = NONE; + + if (NULL == (fp = fopen(path2, "rb"))) + return NULL; + memcpy(magic, "\0\0\0\0\0\0", 4); + fread(magic, 1, 4, fp); + + /* .hsh or .sff at start */ + if (memcmp(magic, ".hsh", 4) == 0) + type = HASH; + else if (memcmp(magic, ".sff", 4) == 0) + type = SFF; + + /* Or .hsh or Ihsh at the end */ + if (NONE == type) { + fseek(fp, -16, SEEK_END); + fread(magic, 1, 8, fp); + if (memcmp(magic+4, ".hsh", 4) == 0) + type = HASH; + else if (memcmp(magic, "Ihsh", 4) == 0) + type = SRF; + } + + /* or ustar 257 bytes in to indicate un-hashed tar */ + if (NONE == type) { + fseek(fp, 257, SEEK_SET); + fread(magic, 1, 5, fp); + if (memcmp(magic, "ustar", 5) == 0) + type = TAR; + } + fclose(fp); + + switch (type) { + case HASH: + return find_file_hash(cp+1, path2); + case TAR: + return find_file_tar(cp+1, path2, 0); + case SFF: + return find_file_sff(cp+1, path2); + case SRF: + return find_file_srf(cp+1, path2); + case NONE: + break; + } + + return NULL; + } + } + + return NULL; +} + +/* + * ------------------------------------------------------------------------ + * Public functions below. + */ + +/* + * Opens a trace file named 'file'. This is initially looked for as a + * pathname relative to a file named "relative_to". This may (for + * example) be the name of an experiment file referencing the trace + * file. In this case by passing relative_to as the experiment file + * filename the trace file will be picked up in the same directory as + * the experiment file. Relative_to may be supplied as NULL. + * + * 'file' is looked for at relative_to, then the current directory, and then + * all of the locations listed in 'path' (which is a colon separated list). + * If 'path' is NULL it uses the RAWDATA environment variable instead. + * + * Returns a mFILE pointer when found. + * NULL otherwise. + */ +mFILE *open_path_mfile(char *file, char *path, char *relative_to) { + char *newsearch; + char *ele; + mFILE *fp; + + /* Use path first */ + if (!path) + path = getenv("RAWDATA"); + if (NULL == (newsearch = tokenise_search_path(path))) + return NULL; + + /* + * Step through the search path testing out each component. + * We now look through each path element treating some prefixes as + * special, otherwise we treat the element as a directory. + */ + for (ele = newsearch; *ele; ele += strlen(ele)+1) { + int i; + char *suffix[6] = {"", ".gz", ".bz2", ".sz", ".Z", ".bz2"}; + for (i = 0; i < 6; i++) { + char file2[1024]; + char *ele2; + int valid = 1; + + /* + * '|' prefixing a path component indicates that we do not + * wish to perform the compression extension searching in that + * location. + */ + if (*ele == '|') { + ele2 = ele+1; + valid = (i == 0); + } else { + ele2 = ele; + } + + sprintf(file2, "%s%s", file, suffix[i]); + + if (0 == strncmp(ele2, "TAR=", 4)) { + if (valid && (fp = find_file_tar(file2, ele2+4, 0))) { + free(newsearch); + return fp; + } + + } else if (0 == strncmp(ele2, "HASH=", 5)) { + if (valid && (fp = find_file_hash(file2, ele2+5))) { + free(newsearch); + return fp; + } +#ifdef TRACE_ARCHIVE + } else if (0 == strncmp(ele2, "ARC=", 4)) { + if (valid && (fp = find_file_archive(file2, ele2+4))) { + free(newsearch); + return fp; + } +#endif +#if defined(USE_WGET) || defined(HAVE_LIBCURL) + } else if (0 == strncmp(ele2, "URL=", 4)) { + if (valid && (fp = find_file_url(file2, ele2+4))) { + free(newsearch); + return fp; + } +#endif + } else if (0 == strncmp(ele2, "SFF=", 4)) { + if (valid && (fp = find_file_sff(file2, ele2+4))) { + free(newsearch); + return fp; + } + + } else if (0 == strncmp(ele2, "SRF=", 4)) { + if (valid && (fp = find_file_srf(file2, ele2+4))) { + free(newsearch); + return fp; + } + + } else { + if (valid && (fp = find_file_dir(file2, ele2))) { + free(newsearch); + return fp; + } + } + } + } + + free(newsearch); + + /* Look in the same location as the incoming 'relative_to' filename */ + if (relative_to) { + char *cp; + char relative_path[PATH_MAX+1]; + strcpy(relative_path, relative_to); + if (cp = strrchr(relative_path, '/')) + *cp = 0; + if (fp = find_file_dir(file, relative_path)) + return fp; + } + + return NULL; +} + +FILE *open_path_file(char *file, char *path, char *relative_to) { + mFILE *mf = open_path_mfile(file, path, relative_to); + FILE *fp; + + if (!mf) + return NULL; + + if (mf->fp) + return mf->fp; + + /* Secure temporary file generation */ + if (NULL == (fp = tmpfile())) + return NULL; + + /* Copy the data */ + fwrite(mf->data, 1, mf->size, fp); + rewind(fp); + mfclose(mf); + + return fp; +} + +static char *exp_path = NULL; +static char *trace_path = NULL; + +void iolib_set_trace_path(char *path) { trace_path = path; } +char *iolib_get_trace_path(void) { return trace_path; } +void iolib_set_exp_path (char *path) { exp_path = path; } +char *iolib_get_exp_path (void) { return exp_path; } + +/* + * Trace file functions: uses TRACE_PATH environment variable. + */ +mFILE *open_trace_mfile(char *file, char *rel_to) { + return open_path_mfile(file, trace_path ? trace_path + : getenv("TRACE_PATH"), rel_to); +} + +FILE *open_trace_file(char *file, char *rel_to) { + return open_path_file(file, trace_path ? trace_path + : getenv("TRACE_PATH"), rel_to); +} + +/* + * Trace file functions: uses EXP_PATH environment variable. + */ +mFILE *open_exp_mfile(char *file, char *relative_to) { + return open_path_mfile(file, exp_path ? exp_path + : getenv("EXP_PATH"), relative_to); +} + +FILE *open_exp_file(char *file, char *relative_to) { + return open_path_file(file, exp_path ? exp_path + : getenv("EXP_PATH"), relative_to); +} +