comparison srf2fastq/io_lib-1.12.2/io_lib/open_trace_file.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 #ifdef HAVE_CONFIG_H
2 # include "io_lib_config.h"
3 #endif
4
5 #if !(defined(_MSC_VER) || defined(__MINGW32__))
6 # define TRACE_ARCHIVE
7 # ifndef HAVE_LIBCURL
8 # define USE_WGET
9 # endif
10 #endif
11
12 #include <stdlib.h>
13 #include <stdio.h>
14 #include <string.h>
15 #include <unistd.h>
16 #include <ctype.h>
17 #include <limits.h>
18 #include "io_lib/os.h"
19 #ifdef TRACE_ARCHIVE
20 # include <sys/socket.h>
21 # include <netinet/in.h>
22 # include <netdb.h>
23 # include <sys/time.h>
24 # include <errno.h>
25 #endif
26 #ifdef USE_WGET
27 # include <sys/wait.h>
28 #endif
29 #ifndef PATH_MAX
30 # define PATH_MAX 1024
31 #endif
32 #ifdef HAVE_LIBCURL
33 # include <curl/curl.h>
34 #endif
35
36 #include "io_lib/open_trace_file.h"
37 #include "io_lib/misc.h"
38 #include "io_lib/tar_format.h"
39 #include "io_lib/compress.h"
40 #include "io_lib/hash_table.h"
41 #include "io_lib/sff.h"
42 #include "io_lib/srf.h"
43
/*
 * Filename suffixes of the supported compression formats; the empty string
 * stands for "no extra suffix". See the magics array in compress.c for the
 * full structure.
 */
static char *magics[] = {
    "",
    ".bz",
    ".gz",
    ".Z",
    ".z",
    ".bz2",
    ".sz"
};
49
/*
 * Tokenises the search path splitting on colons (unix) or semicolons (windows).
 * A doubled colon "::" is an escape for a literal ':' (e.g. in http://foo).
 * Blank path components are dropped and "./" is always appended as the
 * final component.
 *
 * searchpath may be NULL, which is treated as an empty path.
 *
 * Returns: A new search path with items separated by nul chars. Two nul
 *          chars in a row represent the end of the tokenised path.
 * Returns NULL for a failure.
 *
 * The returned data has been malloced. It is up to the caller to free this
 * memory.
 */
static char *tokenise_search_path(char *searchpath) {
    char *newsearch;
    size_t i, j, len;
#ifdef _WIN32
    char path_sep = ';';
#else
    char path_sep = ':';
#endif

    if (!searchpath)
        searchpath = "";

    /* Worst case: every input byte is kept, followed by "\0./\0\0" */
    len = strlen(searchpath);
    newsearch = (char *)malloc(len + 5);
    if (!newsearch)
        return NULL;

    for (i = 0, j = 0; i < len; i++) {
        /* "::" => ":". Used for escaping colons in http://foo */
        if (i + 1 < len && searchpath[i] == ':' && searchpath[i+1] == ':') {
            newsearch[j++] = ':';
            i++;
            continue;
        }

        if (searchpath[i] == path_sep) {
            /* Skip blank path components */
            if (j && newsearch[j-1] != 0)
                newsearch[j++] = 0;
        } else {
            newsearch[j++] = searchpath[i];
        }
    }

    /*
     * Terminate the last component, unless a trailing separator already
     * did so. Adding a second nul here would mark the end of the list
     * and hide the "./" entry appended below.
     */
    if (j && newsearch[j-1] != 0)
        newsearch[j++] = 0;
    newsearch[j++] = '.';
    newsearch[j++] = '/';
    newsearch[j++] = 0;
    newsearch[j++] = 0;

    return newsearch;
}
104
105 /*
106 * Searches for file in the tar pointed to by tarname. If it finds it, it
107 * copies it out and returns a file pointer to the temporary file,
108 * otherwise we return NULL.
109 *
110 * If 'tarname'.index exists we will use this as a fast lookup method,
111 * otherwise we just do a sequential search through the tar.
112 *
113 * Offset specifies a starting search position. Set this to zero if you want
114 * to search through the entire tar file, otherwise set it to the byte offset
115 * into the file of the tar header block for the desired file to extract.
116 * (Note that the tar index file overrides this value.)
117 *
118 * Returns mFILE pointer if found
119 * NULL if not.
120 */
121 static mFILE *find_file_tar(char *file, char *tarname, size_t offset) {
122 int num_magics = sizeof(magics) / sizeof(*magics);
123 char path[PATH_MAX+101];
124 FILE *fp;
125 tar_block blk;
126 int size;
127 int name_len = strlen(file);
128
129 /* Maximum name length for a tar file */
130 if (name_len > 100)
131 return NULL;
132
133 /* Search the .index file */
134 sprintf(path, "%s.index", tarname);
135 if (file_exists(path)) {
136 FILE *fpind = fopen(path, "r");
137 char *cp;
138 int tmp_off;
139 int found = 0;
140
141 if (fpind) {
142 while (fgets(path, PATH_MAX+100, fpind)) {
143 if (cp = strchr(path, '\n'))
144 *cp = 0;
145 tmp_off = strtol(path, &cp, 10);
146 while (isspace(*cp))
147 cp++;
148 if (strncmp(cp, file, name_len) == 0) {
149 int i;
150 for (i = 0; i < num_magics; i++) {
151 if (strcmp(&cp[name_len], magics[i]) == 0) {
152 offset = tmp_off;
153 found = 1;
154 break;
155 }
156 }
157 if (found)
158 break;
159 }
160 }
161 fclose(fpind);
162
163 /* Not in index */
164 if (!found)
165 return NULL;
166 }
167 }
168
169 if (NULL == (fp = fopen(tarname, "rb")))
170 return NULL;
171
172 /*
173 * Search through the tar file (starting from index position) looking
174 * for our filename. If there was no index then we start from position 0.
175 */
176 fseek(fp, offset, SEEK_SET);
177 while(fread(&blk, sizeof(blk), 1, fp) == 1) {
178 if (!blk.header.name[0])
179 break;
180
181 size = strtol(blk.header.size, NULL, 8);
182
183 /* start with the same name... */
184 if (strncmp(blk.header.name, file, name_len) == 0) {
185 char *data;
186 int i;
187
188 /* ... but does it end with a known compression extension? */
189 for (i = 0; i < num_magics; i++) {
190 if (strcmp(&blk.header.name[name_len], magics[i]) == 0) {
191 break;
192 }
193 }
194 /* ... apparently not? continue then */
195 if (i == num_magics)
196 continue;
197
198 /* Found it - copy out the data to an mFILE */
199 if (NULL == (data = (char *)malloc(size)))
200 return NULL;
201 if (size != fread(data, 1, size, fp)) {
202 free(data);
203 return NULL;
204 }
205 return mfcreate(data, size);
206 }
207
208 fseek(fp, TBLOCK*((size+TBLOCK-1)/TBLOCK), SEEK_CUR);
209 }
210
211 fclose(fp);
212 return NULL;
213 }
214
215 /*
216 * Reads a hash file to look for a filename. The hash file contains the
217 * (relative) pathname for the file it is an index for along with the
218 * positions and sizes of each file contained within it. The file format
219 * of the archive itself is irrelevant provided that the data is not
220 * internally compressed in some manner specific to that archive.
221 *
222 * Return mFILE pointer if found
223 * NULL if not
224 */
225 static mFILE *find_file_hash(char *file, char *hashfile) {
226 size_t size;
227 static HashFile *hf = NULL;
228 static char hf_name[1024];
229 char *data;
230
231 /* Cache an open HashFile for fast accesing */
232 if (strcmp(hashfile, hf_name) != 0) {
233 if (hf)
234 HashFileDestroy(hf);
235 hf = HashFileOpen(hashfile);
236
237 if (!hf)
238 return NULL;
239 strcpy(hf_name, hashfile);
240 }
241
242 /* Search */
243 if (NULL == (data = HashFileExtract(hf, file, &size)))
244 return NULL;
245
246 /* Found, so copy the contents to a fake FILE pointer */
247 return mfcreate(data, size);
248 }
249
250 /*
251 * Extracts a single trace from an SRF file.
252 *
253 * Return mFILE pointer if found
254 * NULL if not
255 */
256 static mFILE *find_file_srf(char *tname, char *srffile) {
257 srf_t *srf;
258 uint64_t cpos, hpos, dpos;
259 mFILE *mf = NULL;
260 char *cp;
261
262 if (NULL == (srf = srf_open(srffile, "r")))
263 return NULL;
264
265 if (NULL != (cp = strrchr(tname, '/')))
266 tname = cp+1;
267
268 if (0 == srf_find_trace(srf, tname, &cpos, &hpos, &dpos)) {
269 char *data = malloc(srf->th.trace_hdr_size + srf->tb.trace_size);
270 if (!data) {
271 srf_destroy(srf, 1);
272 return NULL;
273 }
274 memcpy(data, srf->th.trace_hdr, srf->th.trace_hdr_size);
275 memcpy(data + srf->th.trace_hdr_size,
276 srf->tb.trace, srf->tb.trace_size);
277 mf = mfcreate(data, srf->th.trace_hdr_size + srf->tb.trace_size);
278 }
279
280 srf_destroy(srf, 1);
281 return mf;
282 }
283
284 #ifdef TRACE_ARCHIVE
285 /*
286 * Searches for file in the ensembl trace archive pointed to by arcname.
287 * If it finds it, it copies it out and returns a file pointer to the
288 * temporary file, otherwise we return NULL.
289 *
290 * Arcname has the form address:port, eg "titan/22100"
291 *
292 * Returns mFILE pointer if found
293 * NULL if not.
294 */
295 #define RDBUFSZ 8192
296 static mFILE *find_file_archive(char *file, char *arcname) {
297 char server[1024], *cp;
298 int port;
299 struct hostent *host;
300 struct sockaddr_in saddr;
301 int s = 0;
302 char msg[1024];
303 ssize_t msg_len;
304 char buf[RDBUFSZ];
305 mFILE *fpout;
306 int block_count;
307
308 /* Split arc name into server and port */
309 if (!(cp = strchr(arcname, '/')))
310 return NULL;
311 strncpy(server, arcname, 1023);
312 server[MIN(1023,cp-arcname)] = 0;
313 port = atoi(cp+1);
314
315 /* Make and connect socket */
316 if (NULL == (host = gethostbyname(server))) {
317 perror("gethostbyname()");
318 return NULL;
319 }
320 saddr.sin_port = htons(port);
321 saddr.sin_family = host->h_addrtype;
322 memcpy(&saddr.sin_addr,host->h_addr_list[0], host->h_length);
323 if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) {
324 perror("socket()");
325 return NULL;
326 }
327 if (connect(s, (struct sockaddr *)&saddr, sizeof(saddr)) == -1) {
328 perror("connect()");
329 return NULL;
330 }
331
332 /* The minimal message to send down is "--scf tracename" */
333 sprintf(msg, "--scf %.*s\n", 1000, file);
334 msg_len = strlen(msg);
335 if (send(s, msg, msg_len, 0) != msg_len) {
336 /*
337 * partial request sent, but requests are short so if this
338 * happens it's unlikely we'll cure it by sending multiple
339 * fragments.
340 */
341 /* close(s); */
342 return NULL;
343 }
344
345 /*
346 * Create a fake FILE (mFILE) and write to it.
347 */
348 fpout = mfcreate(NULL, 0);
349
350 /*
351 * Read the data back, in multiple blocks if necessary and write it
352 * to our temporary file. We use a blocking read with a low timeout to
353 * prevent locking up the application indefinitely.
354 */
355 {
356 struct timeval tv = {0, 10000};
357 setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv));
358 }
359 errno = 0;
360 block_count = 200;
361 while ((msg_len = read(s, buf, RDBUFSZ)) > 0 ||
362 (errno == EWOULDBLOCK && --block_count)) {
363 errno = 0;
364 if (msg_len > 0)
365 mfwrite(buf, 1, msg_len, fpout);
366 }
367 close(s);
368
369 if (!block_count) {
370 mfclose(fpout);
371 return NULL;
372 }
373
374 mrewind(fpout);
375
376 return fpout;
377 }
378 #endif
379
380 #ifdef USE_WGET
381 /* NB: non-reentrant due to reuse of handle */
382 static mFILE *find_file_url(char *file, char *url) {
383 char buf[8192], *cp;
384 mFILE *fp;
385 int pid;
386 int maxlen = 8190 - strlen(file);
387 char *fname = tempnam(NULL, NULL);
388 int status;
389
390 /* Expand %s for the trace name */
391 for (cp = buf; *url && cp - buf < maxlen; url++) {
392 if (*url == '%' && *(url+1) == 's') {
393 url++;
394 cp += strlen(strcpy(cp, file));
395 } else {
396 *cp++ = *url;
397 }
398 }
399 *cp++ = 0;
400
401 /* Execute wget */
402 if ((pid = fork())) {
403 waitpid(pid, &status, 0);
404 } else {
405 execlp("wget", "wget", "-q", "-O", fname, buf, NULL);
406 }
407
408 /* Return a filepointer to the result (if it exists) */
409 fp = (!status && file_size(fname) != 0) ? mfopen(fname, "rb") : NULL;
410 remove(fname);
411 free(fname);
412
413 return fp;
414 }
415 #endif
416
417 #ifdef HAVE_LIBCURL
418 static mFILE *find_file_url(char *file, char *url) {
419 char buf[8192], *cp;
420 mFILE *mf = NULL, *headers = NULL;
421 int maxlen = 8190 - strlen(file);
422 static CURL *handle = NULL;
423 static int curl_init = 0;
424 char errbuf[CURL_ERROR_SIZE];
425
426 *errbuf = 0;
427
428 if (!curl_init) {
429 if (curl_global_init(CURL_GLOBAL_ALL))
430 return NULL;
431
432 if (NULL == (handle = curl_easy_init()))
433 goto error;
434
435 curl_init = 1;
436 }
437
438 /* Expand %s for the trace name */
439 for (cp = buf; *url && cp - buf < maxlen; url++) {
440 if (*url == '%' && *(url+1) == 's') {
441 url++;
442 cp += strlen(strcpy(cp, file));
443 } else {
444 *cp++ = *url;
445 }
446 }
447 *cp++ = 0;
448
449 /* Setup the curl */
450 if (NULL == (mf = mfcreate(NULL, 0)) ||
451 NULL == (headers = mfcreate(NULL, 0)))
452 return NULL;
453
454 if (0 != curl_easy_setopt(handle, CURLOPT_URL, buf))
455 goto error;
456 if (0 != curl_easy_setopt(handle, CURLOPT_TIMEOUT, 10L))
457 goto error;
458 if (0 != curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, mfwrite))
459 goto error;
460 if (0 != curl_easy_setopt(handle, CURLOPT_WRITEDATA, mf))
461 goto error;
462 if (0 != curl_easy_setopt(handle, CURLOPT_HEADERFUNCTION, mfwrite))
463 goto error;
464 if (0 != curl_easy_setopt(handle, CURLOPT_WRITEHEADER, headers))
465 goto error;
466 if (0 != curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf))
467 goto error;
468
469 /* Fetch! */
470 if (0 != curl_easy_perform(handle))
471 goto error;
472
473 /* Report errors is approproate. 404 is silent as it may have just been
474 * a search via RAWDATA path, everything else is worth reporting.
475 */
476 {
477 float version;
478 int response;
479 char nul = 0;
480 mfwrite(&nul, 1, 1, headers);
481 if (2 == sscanf(headers->data, "HTTP/%f %d", &version, &response)) {
482 if (response != 200) {
483 if (response != 404)
484 fprintf(stderr, "%.*s\n",
485 (int)headers->size, headers->data);
486 goto error;
487 }
488 }
489 }
490
491 if (mftell(mf) == 0)
492 goto error;
493
494 mfdestroy(headers);
495
496 mrewind(mf);
497 return mf;
498
499 error:
500 if (mf)
501 mfdestroy(mf);
502 if (headers)
503 mfdestroy(headers);
504 if (*errbuf)
505 fprintf(stderr, "%s\n", errbuf);
506 return NULL;
507 }
508 #endif
509
510 /*
511 * Takes an SFF file in 'data' and edits the header to ensure
512 * that it has no index listed and only claims to contain a single entry.
513 * This isn't strictly necessary for the sff/sff.c reading code, but it is
514 * the 'Right Thing' to do.
515 *
516 * Returns an mFILE on success or NULL on failure.
517 */
518 static mFILE *sff_single(char *data, size_t size) {
519 *(uint64_t *)(data+8) = be_int8(0); /* index offset */
520 *(uint32_t *)(data+16) = be_int4(0); /* index size */
521 *(uint32_t *)(data+20) = be_int4(1); /* number of reads */
522
523 return mfcreate(data, size);
524 }
525
526 /* Hash (.hsh) format index searching for SFF files */
527 static mFILE *sff_hash_query(char *sff, char *entry, FILE *fp) {
528 static HashFile *hf = NULL;
529 static char sff_copy[1024];
530 static FILE *fp_copy = NULL;
531 char *data;
532 size_t size;
533
534 /* Cache an open HashFile for fast accessing */
535 if (strcmp(sff, sff_copy) != 0) {
536 if (hf) {
537 hf->afp = hf->hfp = NULL; /* will be closed by our parent */
538 HashFileDestroy(hf);
539 }
540 fseek(fp, -4, SEEK_CUR);
541 if (NULL == (hf = HashFileFopen(fp)))
542 return NULL;
543
544 strcpy(sff_copy, sff);
545 fp_copy = fp;
546 }
547
548 data = HashFileExtract(hf, entry, &size);
549
550 return data ? sff_single(data, size) : NULL;
551 }
552
553
/*
 * getuint4_255
 *
 * Decodes a 4-byte TVF/SFF value whose bytes are the digits of a base 255
 * number, most significant digit first. This is the encoding used for the
 * index offsets.
 */
static unsigned int getuint4_255(unsigned char *b)
{
    unsigned int value = 0;
    int digit;

    /* Horner's rule: value = ((b0*255 + b1)*255 + b2)*255 + b3 */
    for (digit = 0; digit < 4; digit++)
        value = value * 255 + (unsigned int)b[digit];

    return value;
}
568
569 /*
570 * 454 sorted format (.srt) index searching for SFF files.
571 * Uses a binary search.
572 * This function and getuint4_255 above are taken with permission
573 * from 454's getsff.c with the following licence:
574 *
575 * Copyright (c)[2001-2005] 454 Life Sciences Corporation. All Rights Reserved.
576 *
577 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
578 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
579 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
580 *
581 * IN NO EVENT SHALL LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
582 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
583 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE.
584 *
585 * Permission to use, copy, modify and distribute this software and its
586 * documentation for any purpose is hereby granted without fee, provided
587 * that this copyright and notice appears in all copies.
588 */
589 static mFILE *sff_sorted_query(char *sff, char *accno, FILE *fp,
590 uint32_t index_length) {
591 static unsigned char *index;
592 static char sff_copy[1024];
593 unsigned char *us;
594 uint32_t start, end;
595 uint32_t offset;
596 char *data = NULL;
597 static char chdr[1024];
598 static int chdrlen = 0, nflows = 0;
599 char rhdr[1024];
600 int rhdrlen;
601 int nbases, dlen;
602 int bytes_per_flow = 2;
603
604 /* Cache index if we're querying the same SFF file */
605 if (strcmp(sff_copy, sff) != 0) {
606 if (index)
607 xfree(index);
608 if (NULL == (index = (unsigned char *)xmalloc(index_length)))
609 return NULL;
610
611 if (index_length != fread(index, 1, index_length, fp)) {
612 xfree(index);
613 return NULL;
614 }
615 strcpy(sff_copy, sff);
616
617 /* Read the common header too - minimal decoding necessary */
618 fseek(fp, 0, SEEK_SET);
619 if (31 != fread(chdr, 1, 31, fp))
620 return NULL;
621 chdrlen = be_int2(*(uint16_t *)(chdr+24));
622 nflows = be_int2(*(uint16_t *)(chdr+28));
623 if (chdrlen-31 != fread(chdr+31, 1, chdrlen-31, fp))
624 return NULL;
625 }
626
627 /*
628 * Perform a binary search of the index, stopping when the search
629 * region becomes relatively small. This assumes that no accession
630 * number is near 200 characters.
631 */
632 start = 0;
633 end = index_length;
634 while (end - start > 200) {
635 uint32_t mid;
636 int val;
637 mid = (start + end) / 2;
638
639 /*
640 * From the byte midpoint, scan backwards to the beginning of the
641 * index record that covers that byte midpoint.
642 */
643 while (mid > start && index[mid-1] != 255) {
644 mid--;
645 }
646 val = strcmp(accno, (char *)(index+mid));
647
648 if (val == 0) {
649 break;
650 } else if (val < 0) {
651 end = mid;
652 } else {
653 start = mid;
654 }
655 }
656
657 /*
658 * Scan through the small search region, looking for the accno.
659 */
660 while (start < end) {
661 if (strcmp(accno, (char *)(index+start)) == 0) {
662 /*
663 * If the accno is found, skip the accno characters,
664 * then get the record offset.
665 */
666 for (us=index+start; *us; us++,start++) ;
667 us++;
668 start++;
669
670 offset = getuint4_255(us);
671 if (us[4] != 255) {
672 return NULL;
673 }
674
675 /*
676 * The original getsff.c here computed the record size by
677 * looking at the next index item and comparing it's offset to
678 * this one, or the end of file position if this is the last
679 * item. This has two problems:
680 * 1: It means the index itself cannot be added to the end of
681 * the file.
682 * 2: It means that we cannot simply add an index to a SFF
683 * file without also reordering all of the items within it.
684 *
685 * We solve this by reading the read header to work out the
686 * object size instead.
687 */
688 break;
689 }
690
691 /*
692 * Skip to the beginning of the next index element.
693 */
694 while (start < end && index[start] != 255) {
695 start++;
696 }
697 start++;
698 }
699
700 /*
701 * Now offset indicates the position of the SFF entry. Read and decode
702 * header to get data length. Then read this too.
703 */
704 fseek(fp, offset, SEEK_SET);
705 if (16 != fread(rhdr, 1, 16, fp))
706 return NULL;
707
708 rhdrlen = be_int2(*(uint16_t *)rhdr);
709 nbases = be_int4(*(uint32_t *)(rhdr+4));
710
711 if (rhdrlen-16 != fread(rhdr+16, 1, rhdrlen-16, fp))
712 return NULL;
713 dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7;
714
715 /* Built up the fake SFF entry */
716 if (NULL == (data = (char *)xmalloc(chdrlen + rhdrlen + dlen)))
717 return NULL;
718
719 memcpy(data, chdr, chdrlen);
720 memcpy(data + chdrlen, rhdr, rhdrlen);
721 if (dlen != fread(data + chdrlen + rhdrlen, 1, dlen, fp)) {
722 xfree(data);
723 return NULL;
724 }
725
726 /* Convert to mFILE */
727 return sff_single(data, chdrlen + rhdrlen + dlen);
728 }
729
730
731 /*
732 * This returns an mFILE containing an SFF entry.
733 *
734 * This does the minimal decoding necessary to skip through the SFF
735 * container to find an entry. In this respect it is a semi-duplication
736 * of sff/sff.[ch], but implemented for efficiency.
737 *
738 * Having found an entry it packs the common header, the read specific
739 * header and the read data into a single block of memory and returns this
740 * as an mFILE. In essence it produces a single-read SFF archive. This
741 * is then decoded by the normal sff parsing code representing a small
742 * amount of redundancy, but one which is swamped by the I/O time.
743 */
744 static mFILE *find_file_sff(char *entry, char *sff) {
745 static FILE *fp = NULL;
746 static char sff_copy[1024];
747 char chdr[65536], rhdr[65536]; /* generous, but worst case */
748 uint32_t nkey, nflows, chdrlen, rhdrlen, dlen, magic;
749 uint64_t file_pos;
750 static uint64_t index_offset = 0;
751 static uint32_t index_length = 0;
752 static char index_format[8];
753 uint32_t nreads, i;
754 size_t entry_len = strlen(entry);
755 int bytes_per_flow = 2;
756 char *fake_file;
757
758 /*
759 * Check cached information so rapid queries to the same archive are
760 * fast.
761 * ASSUMPTION: we won't externally replace the sff file with another of
762 * the same name.
763 */
764 if (strcmp(sff, sff_copy) == 0) {
765 if (memcmp(index_format, ".hsh1.00", 8) == 0) {
766 return sff_hash_query(sff, entry, fp);
767 } else if (memcmp(index_format, ".srt1.00", 8) == 0) {
768 return sff_sorted_query(sff, entry, fp, index_length-8);
769 }
770 }
771
772 if (fp)
773 fclose(fp);
774
775 strcpy(sff_copy, sff);
776 *index_format = 0;
777
778
779 /* Read the common header */
780 if (NULL == (fp = fopen(sff, "rb")))
781 return NULL;
782 if (31 != fread(chdr, 1, 31, fp))
783 return NULL;
784
785 /* Check magic & vers: TODO */
786 magic = be_int4(*(uint32_t *)chdr);
787 if (magic != SFF_MAGIC)
788 return NULL;
789 if (memcmp(chdr+4, SFF_VERSION, 4) != 0)
790 return NULL;
791
792 /* If we have an index, use it, otherwise search linearly */
793 index_offset = be_int8(*(uint64_t *)(chdr+8));
794 index_length = be_int4(*(uint32_t *)(chdr+16));
795 if (index_length != 0) {
796 long orig_pos = ftell(fp);
797 fseek(fp, index_offset, SEEK_SET);
798 fread(index_format, 1, 8, fp);
799
800 if (memcmp(index_format, ".hsh1.00", 8) == 0) {
801 /* HASH index v1.00 */
802 return sff_hash_query(sff, entry, fp);
803
804 } else if (memcmp(index_format, ".srt1.00", 8) == 0) {
805 /* 454 sorted v1.00 */
806 return sff_sorted_query(sff, entry, fp, index_length-8);
807 } else {
808 /* Unknown index: revert back to a slow linear scan */
809 fseek(fp, orig_pos, SEEK_SET);
810 }
811 }
812
813 nreads = be_int4(*(uint32_t *)(chdr+20));
814 chdrlen = be_int2(*(uint16_t *)(chdr+24));
815 nkey = be_int2(*(uint16_t *)(chdr+26));
816 nflows = be_int2(*(uint16_t *)(chdr+28));
817
818 /* Read the remainder of the header */
819 if (chdrlen-31 != fread(chdr+31, 1, chdrlen-31, fp))
820 return NULL;
821
822 file_pos = chdrlen;
823
824 /* Loop until we find the correct entry */
825 for (i = 0; i < nreads; i++) {
826 uint16_t name_len;
827 uint32_t nbases;
828
829 /* Index could be between common header and first read - skip */
830 if (file_pos == index_offset) {
831 fseek(fp, index_length, SEEK_CUR);
832 file_pos += index_length;
833 }
834
835 /* Read 16 bytes to get name length */
836 if (16 != fread(rhdr, 1, 16, fp))
837 return NULL;
838 rhdrlen = be_int2(*(uint16_t *)rhdr);
839 name_len = be_int2(*(uint16_t *)(rhdr+2));
840 nbases = be_int4(*(uint32_t *)(rhdr+4));
841
842 /* Read the rest of the header */
843 if (rhdrlen-16 != fread(rhdr+16, 1, rhdrlen-16, fp))
844 return NULL;
845
846 file_pos += rhdrlen;
847
848 dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7;
849
850 if (name_len == entry_len && 0 == memcmp(rhdr+16, entry, entry_len))
851 break;
852
853 /* This is not the read you are looking for... */
854 fseek(fp, dlen, SEEK_CUR);
855 }
856
857 if (i == nreads) {
858 /* Not found */
859 return NULL;
860 }
861
862 /*
863 * Although we've decoded some bits already, we take the more modular
864 * approach of packing the sections together and passing the entire
865 * data structure off as a single-read SFF file to be decoded fully
866 * by the sff reading code.
867 */
868 if (NULL == (fake_file = (char *)xmalloc(chdrlen + rhdrlen + dlen)))
869 return NULL;
870
871 memcpy(fake_file, chdr, chdrlen);
872 memcpy(fake_file+chdrlen, rhdr, rhdrlen);
873 if (dlen != fread(fake_file+chdrlen+rhdrlen, 1, dlen, fp)) {
874 xfree(fake_file);
875 return NULL;
876 }
877
878 /* Convert to an mFILE and return */
879 return sff_single(fake_file, chdrlen+rhdrlen+dlen);
880 }
881
882 /*
883 * Searches for file in the directory 'dirname'. If it finds it, it opens
884 * it. This also searches for compressed versions of the file in dirname
885 * too.
886 *
887 * Returns mFILE pointer if found
888 * NULL if not
889 */
890 static mFILE *find_file_dir(char *file, char *dirname) {
891 char path[PATH_MAX+1], path2[PATH_MAX+1];
892 size_t len = strlen(dirname);
893 char *cp;
894
895 if (dirname[len-1] == '/')
896 len--;
897
898 /* Special case for "./" or absolute filenames */
899 if (*file == '/' || (len==1 && *dirname == '.'))
900 sprintf(path, "%s", file);
901 else
902 sprintf(path, "%.*s/%s", (int)len, dirname, file);
903
904 if (is_file(path)) {
905 return mfopen(path, "rb");
906 }
907
908 /*
909 * Given a pathname /a/b/c if a/b is a file and not a directory then
910 * we'd get an ENOTDIR error. Instead we assume that a/b is an archive
911 * and we attempt to work out what type by reading the first and last
912 * bits of the file.
913 */
914 if (cp = strrchr(file, '/')) {
915 strcpy(path2, path); /* path contains / too as it's from file */
916 *strrchr(path2, '/') = 0;
917
918 if (is_file(path2)) {
919 /* Open the archive to test for magic numbers */
920 char magic[8];
921 FILE *fp;
922 enum archive_type_t {
923 NONE, HASH, TAR, SFF, SRF
924 } type = NONE;
925
926 if (NULL == (fp = fopen(path2, "rb")))
927 return NULL;
928 memcpy(magic, "\0\0\0\0\0\0", 4);
929 fread(magic, 1, 4, fp);
930
931 /* .hsh or .sff at start */
932 if (memcmp(magic, ".hsh", 4) == 0)
933 type = HASH;
934 else if (memcmp(magic, ".sff", 4) == 0)
935 type = SFF;
936
937 /* Or .hsh or Ihsh at the end */
938 if (NONE == type) {
939 fseek(fp, -16, SEEK_END);
940 fread(magic, 1, 8, fp);
941 if (memcmp(magic+4, ".hsh", 4) == 0)
942 type = HASH;
943 else if (memcmp(magic, "Ihsh", 4) == 0)
944 type = SRF;
945 }
946
947 /* or ustar 257 bytes in to indicate un-hashed tar */
948 if (NONE == type) {
949 fseek(fp, 257, SEEK_SET);
950 fread(magic, 1, 5, fp);
951 if (memcmp(magic, "ustar", 5) == 0)
952 type = TAR;
953 }
954 fclose(fp);
955
956 switch (type) {
957 case HASH:
958 return find_file_hash(cp+1, path2);
959 case TAR:
960 return find_file_tar(cp+1, path2, 0);
961 case SFF:
962 return find_file_sff(cp+1, path2);
963 case SRF:
964 return find_file_srf(cp+1, path2);
965 case NONE:
966 break;
967 }
968
969 return NULL;
970 }
971 }
972
973 return NULL;
974 }
975
976 /*
977 * ------------------------------------------------------------------------
978 * Public functions below.
979 */
980
981 /*
982 * Opens a trace file named 'file'. This is initially looked for as a
983 * pathname relative to a file named "relative_to". This may (for
984 * example) be the name of an experiment file referencing the trace
985 * file. In this case by passing relative_to as the experiment file
986 * filename the trace file will be picked up in the same directory as
987 * the experiment file. Relative_to may be supplied as NULL.
988 *
989 * 'file' is looked for at relative_to, then the current directory, and then
990 * all of the locations listed in 'path' (which is a colon separated list).
991 * If 'path' is NULL it uses the RAWDATA environment variable instead.
992 *
993 * Returns a mFILE pointer when found.
994 * NULL otherwise.
995 */
996 mFILE *open_path_mfile(char *file, char *path, char *relative_to) {
997 char *newsearch;
998 char *ele;
999 mFILE *fp;
1000
1001 /* Use path first */
1002 if (!path)
1003 path = getenv("RAWDATA");
1004 if (NULL == (newsearch = tokenise_search_path(path)))
1005 return NULL;
1006
1007 /*
1008 * Step through the search path testing out each component.
1009 * We now look through each path element treating some prefixes as
1010 * special, otherwise we treat the element as a directory.
1011 */
1012 for (ele = newsearch; *ele; ele += strlen(ele)+1) {
1013 int i;
1014 char *suffix[6] = {"", ".gz", ".bz2", ".sz", ".Z", ".bz2"};
1015 for (i = 0; i < 6; i++) {
1016 char file2[1024];
1017 char *ele2;
1018 int valid = 1;
1019
1020 /*
1021 * '|' prefixing a path component indicates that we do not
1022 * wish to perform the compression extension searching in that
1023 * location.
1024 */
1025 if (*ele == '|') {
1026 ele2 = ele+1;
1027 valid = (i == 0);
1028 } else {
1029 ele2 = ele;
1030 }
1031
1032 sprintf(file2, "%s%s", file, suffix[i]);
1033
1034 if (0 == strncmp(ele2, "TAR=", 4)) {
1035 if (valid && (fp = find_file_tar(file2, ele2+4, 0))) {
1036 free(newsearch);
1037 return fp;
1038 }
1039
1040 } else if (0 == strncmp(ele2, "HASH=", 5)) {
1041 if (valid && (fp = find_file_hash(file2, ele2+5))) {
1042 free(newsearch);
1043 return fp;
1044 }
1045 #ifdef TRACE_ARCHIVE
1046 } else if (0 == strncmp(ele2, "ARC=", 4)) {
1047 if (valid && (fp = find_file_archive(file2, ele2+4))) {
1048 free(newsearch);
1049 return fp;
1050 }
1051 #endif
1052 #if defined(USE_WGET) || defined(HAVE_LIBCURL)
1053 } else if (0 == strncmp(ele2, "URL=", 4)) {
1054 if (valid && (fp = find_file_url(file2, ele2+4))) {
1055 free(newsearch);
1056 return fp;
1057 }
1058 #endif
1059 } else if (0 == strncmp(ele2, "SFF=", 4)) {
1060 if (valid && (fp = find_file_sff(file2, ele2+4))) {
1061 free(newsearch);
1062 return fp;
1063 }
1064
1065 } else if (0 == strncmp(ele2, "SRF=", 4)) {
1066 if (valid && (fp = find_file_srf(file2, ele2+4))) {
1067 free(newsearch);
1068 return fp;
1069 }
1070
1071 } else {
1072 if (valid && (fp = find_file_dir(file2, ele2))) {
1073 free(newsearch);
1074 return fp;
1075 }
1076 }
1077 }
1078 }
1079
1080 free(newsearch);
1081
1082 /* Look in the same location as the incoming 'relative_to' filename */
1083 if (relative_to) {
1084 char *cp;
1085 char relative_path[PATH_MAX+1];
1086 strcpy(relative_path, relative_to);
1087 if (cp = strrchr(relative_path, '/'))
1088 *cp = 0;
1089 if (fp = find_file_dir(file, relative_path))
1090 return fp;
1091 }
1092
1093 return NULL;
1094 }
1095
1096 FILE *open_path_file(char *file, char *path, char *relative_to) {
1097 mFILE *mf = open_path_mfile(file, path, relative_to);
1098 FILE *fp;
1099
1100 if (!mf)
1101 return NULL;
1102
1103 if (mf->fp)
1104 return mf->fp;
1105
1106 /* Secure temporary file generation */
1107 if (NULL == (fp = tmpfile()))
1108 return NULL;
1109
1110 /* Copy the data */
1111 fwrite(mf->data, 1, mf->size, fp);
1112 rewind(fp);
1113 mfclose(mf);
1114
1115 return fp;
1116 }
1117
/*
 * Optional overrides for the experiment-file and trace-file search paths.
 * Both start out NULL. The setters store the caller's pointer as given
 * (the string is not copied) and the getters hand that same pointer back.
 */
static char *exp_path = NULL;
static char *trace_path = NULL;

void iolib_set_trace_path(char *path)
{
    trace_path = path;
}

char *iolib_get_trace_path(void)
{
    return trace_path;
}

void iolib_set_exp_path(char *path)
{
    exp_path = path;
}

char *iolib_get_exp_path(void)
{
    return exp_path;
}
1125
1126 /*
1127 * Trace file functions: uses TRACE_PATH environment variable.
1128 */
1129 mFILE *open_trace_mfile(char *file, char *rel_to) {
1130 return open_path_mfile(file, trace_path ? trace_path
1131 : getenv("TRACE_PATH"), rel_to);
1132 }
1133
1134 FILE *open_trace_file(char *file, char *rel_to) {
1135 return open_path_file(file, trace_path ? trace_path
1136 : getenv("TRACE_PATH"), rel_to);
1137 }
1138
1139 /*
1140 * Trace file functions: uses EXP_PATH environment variable.
1141 */
1142 mFILE *open_exp_mfile(char *file, char *relative_to) {
1143 return open_path_mfile(file, exp_path ? exp_path
1144 : getenv("EXP_PATH"), relative_to);
1145 }
1146
1147 FILE *open_exp_file(char *file, char *relative_to) {
1148 return open_path_file(file, exp_path ? exp_path
1149 : getenv("EXP_PATH"), relative_to);
1150 }
1151