Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/io_lib/open_trace_file.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 #ifdef HAVE_CONFIG_H | |
2 # include "io_lib_config.h" | |
3 #endif | |
4 | |
5 #if !(defined(_MSC_VER) || defined(__MINGW32__)) | |
6 # define TRACE_ARCHIVE | |
7 # ifndef HAVE_LIBCURL | |
8 # define USE_WGET | |
9 # endif | |
10 #endif | |
11 | |
12 #include <stdlib.h> | |
13 #include <stdio.h> | |
14 #include <string.h> | |
15 #include <unistd.h> | |
16 #include <ctype.h> | |
17 #include <limits.h> | |
18 #include "io_lib/os.h" | |
19 #ifdef TRACE_ARCHIVE | |
20 # include <sys/socket.h> | |
21 # include <netinet/in.h> | |
22 # include <netdb.h> | |
23 # include <sys/time.h> | |
24 # include <errno.h> | |
25 #endif | |
26 #ifdef USE_WGET | |
27 # include <sys/wait.h> | |
28 #endif | |
29 #ifndef PATH_MAX | |
30 # define PATH_MAX 1024 | |
31 #endif | |
32 #ifdef HAVE_LIBCURL | |
33 # include <curl/curl.h> | |
34 #endif | |
35 | |
36 #include "io_lib/open_trace_file.h" | |
37 #include "io_lib/misc.h" | |
38 #include "io_lib/tar_format.h" | |
39 #include "io_lib/compress.h" | |
40 #include "io_lib/hash_table.h" | |
41 #include "io_lib/sff.h" | |
42 #include "io_lib/srf.h" | |
43 | |
/*
 * Compression suffixes that may follow a trace name inside an archive.
 * The first entry is the empty string, i.e. the plain uncompressed name.
 * See the magics array in compress.c for the full structure.
 */
static char *magics[] = {
    "",
    ".bz",
    ".gz",
    ".Z",
    ".z",
    ".bz2",
    ".sz"
};
49 | |
/*
 * Tokenises the search path, splitting on colons (unix) or semicolons
 * (windows). A doubled colon "::" is an escape for one literal ':' (needed
 * for elements such as http://foo). Blank path components are dropped and
 * "./" is always appended as the final element.
 *
 * searchpath may be NULL, which is treated as an empty path.
 *
 * Returns: A new search path with items separated by nul chars. Two nul
 *          chars in a row represent the end of the tokenised path.
 *          Returns NULL for a failure.
 *
 * The returned data has been malloced. It is up to the caller to free this
 * memory.
 */
static char *tokenise_search_path(char *searchpath) {
    char *newsearch;
    size_t i, j, len;
#ifdef _WIN32
    const char path_sep = ';';
#else
    const char path_sep = ':';
#endif

    if (!searchpath)
        searchpath = "";
    len = strlen(searchpath);

    /* Worst case: every input byte is copied, followed by "\0./\0\0" */
    newsearch = (char *)malloc(len + 5);
    if (!newsearch)
        return NULL;

    for (i = 0, j = 0; i < len; i++) {
        /* "::" => ":". Used for escaping colons in http://foo */
        if (i + 1 < len && searchpath[i] == ':' && searchpath[i+1] == ':') {
            newsearch[j++] = ':';
            i++;
            continue;
        }

        if (searchpath[i] == path_sep) {
            /* Skip blank path components */
            if (j && newsearch[j-1] != 0)
                newsearch[j++] = 0;
        } else {
            newsearch[j++] = searchpath[i];
        }
    }

    /*
     * Terminate the last element, but only if it is still open. A path
     * ending in a separator has already been terminated above; adding a
     * second nul there would mark the end of the list and hide the "./"
     * element appended below.
     */
    if (j && newsearch[j-1] != 0)
        newsearch[j++] = 0;
    newsearch[j++] = '.';
    newsearch[j++] = '/';
    newsearch[j++] = 0;
    newsearch[j++] = 0;

    return newsearch;
}
104 | |
105 /* | |
106 * Searches for file in the tar pointed to by tarname. If it finds it, it | |
107 * copies it out and returns a file pointer to the temporary file, | |
108 * otherwise we return NULL. | |
109 * | |
110 * If 'tarname'.index exists we will use this as a fast lookup method, | |
111 * otherwise we just do a sequential search through the tar. | |
112 * | |
113 * Offset specifies a starting search position. Set this to zero if you want | |
114 * to search through the entire tar file, otherwise set it to the byte offset | |
115 * into the file of the tar header block for the desired file to extract. | |
116 * (Note that the tar index file overrides this value.) | |
117 * | |
118 * Returns mFILE pointer if found | |
119 * NULL if not. | |
120 */ | |
121 static mFILE *find_file_tar(char *file, char *tarname, size_t offset) { | |
122 int num_magics = sizeof(magics) / sizeof(*magics); | |
123 char path[PATH_MAX+101]; | |
124 FILE *fp; | |
125 tar_block blk; | |
126 int size; | |
127 int name_len = strlen(file); | |
128 | |
129 /* Maximum name length for a tar file */ | |
130 if (name_len > 100) | |
131 return NULL; | |
132 | |
133 /* Search the .index file */ | |
134 sprintf(path, "%s.index", tarname); | |
135 if (file_exists(path)) { | |
136 FILE *fpind = fopen(path, "r"); | |
137 char *cp; | |
138 int tmp_off; | |
139 int found = 0; | |
140 | |
141 if (fpind) { | |
142 while (fgets(path, PATH_MAX+100, fpind)) { | |
143 if (cp = strchr(path, '\n')) | |
144 *cp = 0; | |
145 tmp_off = strtol(path, &cp, 10); | |
146 while (isspace(*cp)) | |
147 cp++; | |
148 if (strncmp(cp, file, name_len) == 0) { | |
149 int i; | |
150 for (i = 0; i < num_magics; i++) { | |
151 if (strcmp(&cp[name_len], magics[i]) == 0) { | |
152 offset = tmp_off; | |
153 found = 1; | |
154 break; | |
155 } | |
156 } | |
157 if (found) | |
158 break; | |
159 } | |
160 } | |
161 fclose(fpind); | |
162 | |
163 /* Not in index */ | |
164 if (!found) | |
165 return NULL; | |
166 } | |
167 } | |
168 | |
169 if (NULL == (fp = fopen(tarname, "rb"))) | |
170 return NULL; | |
171 | |
172 /* | |
173 * Search through the tar file (starting from index position) looking | |
174 * for our filename. If there was no index then we start from position 0. | |
175 */ | |
176 fseek(fp, offset, SEEK_SET); | |
177 while(fread(&blk, sizeof(blk), 1, fp) == 1) { | |
178 if (!blk.header.name[0]) | |
179 break; | |
180 | |
181 size = strtol(blk.header.size, NULL, 8); | |
182 | |
183 /* start with the same name... */ | |
184 if (strncmp(blk.header.name, file, name_len) == 0) { | |
185 char *data; | |
186 int i; | |
187 | |
188 /* ... but does it end with a known compression extension? */ | |
189 for (i = 0; i < num_magics; i++) { | |
190 if (strcmp(&blk.header.name[name_len], magics[i]) == 0) { | |
191 break; | |
192 } | |
193 } | |
194 /* ... apparently not? continue then */ | |
195 if (i == num_magics) | |
196 continue; | |
197 | |
198 /* Found it - copy out the data to an mFILE */ | |
199 if (NULL == (data = (char *)malloc(size))) | |
200 return NULL; | |
201 if (size != fread(data, 1, size, fp)) { | |
202 free(data); | |
203 return NULL; | |
204 } | |
205 return mfcreate(data, size); | |
206 } | |
207 | |
208 fseek(fp, TBLOCK*((size+TBLOCK-1)/TBLOCK), SEEK_CUR); | |
209 } | |
210 | |
211 fclose(fp); | |
212 return NULL; | |
213 } | |
214 | |
215 /* | |
216 * Reads a hash file to look for a filename. The hash file contains the | |
217 * (relative) pathname for the file it is an index for along with the | |
218 * positions and sizes of each file contained within it. The file format | |
219 * of the archive itself is irrelevant provided that the data is not | |
220 * internally compressed in some manner specific to that archive. | |
221 * | |
222 * Return mFILE pointer if found | |
223 * NULL if not | |
224 */ | |
225 static mFILE *find_file_hash(char *file, char *hashfile) { | |
226 size_t size; | |
227 static HashFile *hf = NULL; | |
228 static char hf_name[1024]; | |
229 char *data; | |
230 | |
231 /* Cache an open HashFile for fast accesing */ | |
232 if (strcmp(hashfile, hf_name) != 0) { | |
233 if (hf) | |
234 HashFileDestroy(hf); | |
235 hf = HashFileOpen(hashfile); | |
236 | |
237 if (!hf) | |
238 return NULL; | |
239 strcpy(hf_name, hashfile); | |
240 } | |
241 | |
242 /* Search */ | |
243 if (NULL == (data = HashFileExtract(hf, file, &size))) | |
244 return NULL; | |
245 | |
246 /* Found, so copy the contents to a fake FILE pointer */ | |
247 return mfcreate(data, size); | |
248 } | |
249 | |
250 /* | |
251 * Extracts a single trace from an SRF file. | |
252 * | |
253 * Return mFILE pointer if found | |
254 * NULL if not | |
255 */ | |
256 static mFILE *find_file_srf(char *tname, char *srffile) { | |
257 srf_t *srf; | |
258 uint64_t cpos, hpos, dpos; | |
259 mFILE *mf = NULL; | |
260 char *cp; | |
261 | |
262 if (NULL == (srf = srf_open(srffile, "r"))) | |
263 return NULL; | |
264 | |
265 if (NULL != (cp = strrchr(tname, '/'))) | |
266 tname = cp+1; | |
267 | |
268 if (0 == srf_find_trace(srf, tname, &cpos, &hpos, &dpos)) { | |
269 char *data = malloc(srf->th.trace_hdr_size + srf->tb.trace_size); | |
270 if (!data) { | |
271 srf_destroy(srf, 1); | |
272 return NULL; | |
273 } | |
274 memcpy(data, srf->th.trace_hdr, srf->th.trace_hdr_size); | |
275 memcpy(data + srf->th.trace_hdr_size, | |
276 srf->tb.trace, srf->tb.trace_size); | |
277 mf = mfcreate(data, srf->th.trace_hdr_size + srf->tb.trace_size); | |
278 } | |
279 | |
280 srf_destroy(srf, 1); | |
281 return mf; | |
282 } | |
283 | |
284 #ifdef TRACE_ARCHIVE | |
285 /* | |
286 * Searches for file in the ensembl trace archive pointed to by arcname. | |
287 * If it finds it, it copies it out and returns a file pointer to the | |
288 * temporary file, otherwise we return NULL. | |
289 * | |
290 * Arcname has the form address:port, eg "titan/22100" | |
291 * | |
292 * Returns mFILE pointer if found | |
293 * NULL if not. | |
294 */ | |
295 #define RDBUFSZ 8192 | |
296 static mFILE *find_file_archive(char *file, char *arcname) { | |
297 char server[1024], *cp; | |
298 int port; | |
299 struct hostent *host; | |
300 struct sockaddr_in saddr; | |
301 int s = 0; | |
302 char msg[1024]; | |
303 ssize_t msg_len; | |
304 char buf[RDBUFSZ]; | |
305 mFILE *fpout; | |
306 int block_count; | |
307 | |
308 /* Split arc name into server and port */ | |
309 if (!(cp = strchr(arcname, '/'))) | |
310 return NULL; | |
311 strncpy(server, arcname, 1023); | |
312 server[MIN(1023,cp-arcname)] = 0; | |
313 port = atoi(cp+1); | |
314 | |
315 /* Make and connect socket */ | |
316 if (NULL == (host = gethostbyname(server))) { | |
317 perror("gethostbyname()"); | |
318 return NULL; | |
319 } | |
320 saddr.sin_port = htons(port); | |
321 saddr.sin_family = host->h_addrtype; | |
322 memcpy(&saddr.sin_addr,host->h_addr_list[0], host->h_length); | |
323 if ((s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) { | |
324 perror("socket()"); | |
325 return NULL; | |
326 } | |
327 if (connect(s, (struct sockaddr *)&saddr, sizeof(saddr)) == -1) { | |
328 perror("connect()"); | |
329 return NULL; | |
330 } | |
331 | |
332 /* The minimal message to send down is "--scf tracename" */ | |
333 sprintf(msg, "--scf %.*s\n", 1000, file); | |
334 msg_len = strlen(msg); | |
335 if (send(s, msg, msg_len, 0) != msg_len) { | |
336 /* | |
337 * partial request sent, but requests are short so if this | |
338 * happens it's unlikely we'll cure it by sending multiple | |
339 * fragments. | |
340 */ | |
341 /* close(s); */ | |
342 return NULL; | |
343 } | |
344 | |
345 /* | |
346 * Create a fake FILE (mFILE) and write to it. | |
347 */ | |
348 fpout = mfcreate(NULL, 0); | |
349 | |
350 /* | |
351 * Read the data back, in multiple blocks if necessary and write it | |
352 * to our temporary file. We use a blocking read with a low timeout to | |
353 * prevent locking up the application indefinitely. | |
354 */ | |
355 { | |
356 struct timeval tv = {0, 10000}; | |
357 setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)); | |
358 } | |
359 errno = 0; | |
360 block_count = 200; | |
361 while ((msg_len = read(s, buf, RDBUFSZ)) > 0 || | |
362 (errno == EWOULDBLOCK && --block_count)) { | |
363 errno = 0; | |
364 if (msg_len > 0) | |
365 mfwrite(buf, 1, msg_len, fpout); | |
366 } | |
367 close(s); | |
368 | |
369 if (!block_count) { | |
370 mfclose(fpout); | |
371 return NULL; | |
372 } | |
373 | |
374 mrewind(fpout); | |
375 | |
376 return fpout; | |
377 } | |
378 #endif | |
379 | |
380 #ifdef USE_WGET | |
381 /* NB: non-reentrant due to reuse of handle */ | |
382 static mFILE *find_file_url(char *file, char *url) { | |
383 char buf[8192], *cp; | |
384 mFILE *fp; | |
385 int pid; | |
386 int maxlen = 8190 - strlen(file); | |
387 char *fname = tempnam(NULL, NULL); | |
388 int status; | |
389 | |
390 /* Expand %s for the trace name */ | |
391 for (cp = buf; *url && cp - buf < maxlen; url++) { | |
392 if (*url == '%' && *(url+1) == 's') { | |
393 url++; | |
394 cp += strlen(strcpy(cp, file)); | |
395 } else { | |
396 *cp++ = *url; | |
397 } | |
398 } | |
399 *cp++ = 0; | |
400 | |
401 /* Execute wget */ | |
402 if ((pid = fork())) { | |
403 waitpid(pid, &status, 0); | |
404 } else { | |
405 execlp("wget", "wget", "-q", "-O", fname, buf, NULL); | |
406 } | |
407 | |
408 /* Return a filepointer to the result (if it exists) */ | |
409 fp = (!status && file_size(fname) != 0) ? mfopen(fname, "rb") : NULL; | |
410 remove(fname); | |
411 free(fname); | |
412 | |
413 return fp; | |
414 } | |
415 #endif | |
416 | |
417 #ifdef HAVE_LIBCURL | |
418 static mFILE *find_file_url(char *file, char *url) { | |
419 char buf[8192], *cp; | |
420 mFILE *mf = NULL, *headers = NULL; | |
421 int maxlen = 8190 - strlen(file); | |
422 static CURL *handle = NULL; | |
423 static int curl_init = 0; | |
424 char errbuf[CURL_ERROR_SIZE]; | |
425 | |
426 *errbuf = 0; | |
427 | |
428 if (!curl_init) { | |
429 if (curl_global_init(CURL_GLOBAL_ALL)) | |
430 return NULL; | |
431 | |
432 if (NULL == (handle = curl_easy_init())) | |
433 goto error; | |
434 | |
435 curl_init = 1; | |
436 } | |
437 | |
438 /* Expand %s for the trace name */ | |
439 for (cp = buf; *url && cp - buf < maxlen; url++) { | |
440 if (*url == '%' && *(url+1) == 's') { | |
441 url++; | |
442 cp += strlen(strcpy(cp, file)); | |
443 } else { | |
444 *cp++ = *url; | |
445 } | |
446 } | |
447 *cp++ = 0; | |
448 | |
449 /* Setup the curl */ | |
450 if (NULL == (mf = mfcreate(NULL, 0)) || | |
451 NULL == (headers = mfcreate(NULL, 0))) | |
452 return NULL; | |
453 | |
454 if (0 != curl_easy_setopt(handle, CURLOPT_URL, buf)) | |
455 goto error; | |
456 if (0 != curl_easy_setopt(handle, CURLOPT_TIMEOUT, 10L)) | |
457 goto error; | |
458 if (0 != curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, mfwrite)) | |
459 goto error; | |
460 if (0 != curl_easy_setopt(handle, CURLOPT_WRITEDATA, mf)) | |
461 goto error; | |
462 if (0 != curl_easy_setopt(handle, CURLOPT_HEADERFUNCTION, mfwrite)) | |
463 goto error; | |
464 if (0 != curl_easy_setopt(handle, CURLOPT_WRITEHEADER, headers)) | |
465 goto error; | |
466 if (0 != curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf)) | |
467 goto error; | |
468 | |
469 /* Fetch! */ | |
470 if (0 != curl_easy_perform(handle)) | |
471 goto error; | |
472 | |
473 /* Report errors is approproate. 404 is silent as it may have just been | |
474 * a search via RAWDATA path, everything else is worth reporting. | |
475 */ | |
476 { | |
477 float version; | |
478 int response; | |
479 char nul = 0; | |
480 mfwrite(&nul, 1, 1, headers); | |
481 if (2 == sscanf(headers->data, "HTTP/%f %d", &version, &response)) { | |
482 if (response != 200) { | |
483 if (response != 404) | |
484 fprintf(stderr, "%.*s\n", | |
485 (int)headers->size, headers->data); | |
486 goto error; | |
487 } | |
488 } | |
489 } | |
490 | |
491 if (mftell(mf) == 0) | |
492 goto error; | |
493 | |
494 mfdestroy(headers); | |
495 | |
496 mrewind(mf); | |
497 return mf; | |
498 | |
499 error: | |
500 if (mf) | |
501 mfdestroy(mf); | |
502 if (headers) | |
503 mfdestroy(headers); | |
504 if (*errbuf) | |
505 fprintf(stderr, "%s\n", errbuf); | |
506 return NULL; | |
507 } | |
508 #endif | |
509 | |
510 /* | |
511 * Takes an SFF file in 'data' and edits the header to ensure | |
512 * that it has no index listed and only claims to contain a single entry. | |
513 * This isn't strictly necessary for the sff/sff.c reading code, but it is | |
514 * the 'Right Thing' to do. | |
515 * | |
516 * Returns an mFILE on success or NULL on failure. | |
517 */ | |
518 static mFILE *sff_single(char *data, size_t size) { | |
519 *(uint64_t *)(data+8) = be_int8(0); /* index offset */ | |
520 *(uint32_t *)(data+16) = be_int4(0); /* index size */ | |
521 *(uint32_t *)(data+20) = be_int4(1); /* number of reads */ | |
522 | |
523 return mfcreate(data, size); | |
524 } | |
525 | |
526 /* Hash (.hsh) format index searching for SFF files */ | |
527 static mFILE *sff_hash_query(char *sff, char *entry, FILE *fp) { | |
528 static HashFile *hf = NULL; | |
529 static char sff_copy[1024]; | |
530 static FILE *fp_copy = NULL; | |
531 char *data; | |
532 size_t size; | |
533 | |
534 /* Cache an open HashFile for fast accessing */ | |
535 if (strcmp(sff, sff_copy) != 0) { | |
536 if (hf) { | |
537 hf->afp = hf->hfp = NULL; /* will be closed by our parent */ | |
538 HashFileDestroy(hf); | |
539 } | |
540 fseek(fp, -4, SEEK_CUR); | |
541 if (NULL == (hf = HashFileFopen(fp))) | |
542 return NULL; | |
543 | |
544 strcpy(sff_copy, sff); | |
545 fp_copy = fp; | |
546 } | |
547 | |
548 data = HashFileExtract(hf, entry, &size); | |
549 | |
550 return data ? sff_single(data, size) : NULL; | |
551 } | |
552 | |
553 | |
/*
 * getuint4_255
 *
 * Converts a 4-byte TVF/SFF value into an integer, treating the bytes as
 * the digits of a base 255 number with the most significant digit first.
 * This is the encoding used to store the index offsets.
 */
static unsigned int getuint4_255(unsigned char *b)
{
    unsigned int value = 0;
    int digit;

    /* Horner's rule: fold one base 255 digit in per step */
    for (digit = 0; digit < 4; digit++)
        value = value * 255 + b[digit];

    return value;
}
568 | |
569 /* | |
570 * 454 sorted format (.srt) index searching for SFF files. | |
571 * Uses a binary search. | |
572 * This function and getuint4_255 above are taken with permission | |
573 * from 454's getsff.c with the following licence: | |
574 * | |
575 * Copyright (c)[2001-2005] 454 Life Sciences Corporation. All Rights Reserved. | |
576 * | |
577 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
578 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
579 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
580 * | |
581 * IN NO EVENT SHALL LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
582 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
583 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE. | |
584 * | |
585 * Permission to use, copy, modify and distribute this software and its | |
586 * documentation for any purpose is hereby granted without fee, provided | |
587 * that this copyright and notice appears in all copies. | |
588 */ | |
589 static mFILE *sff_sorted_query(char *sff, char *accno, FILE *fp, | |
590 uint32_t index_length) { | |
591 static unsigned char *index; | |
592 static char sff_copy[1024]; | |
593 unsigned char *us; | |
594 uint32_t start, end; | |
595 uint32_t offset; | |
596 char *data = NULL; | |
597 static char chdr[1024]; | |
598 static int chdrlen = 0, nflows = 0; | |
599 char rhdr[1024]; | |
600 int rhdrlen; | |
601 int nbases, dlen; | |
602 int bytes_per_flow = 2; | |
603 | |
604 /* Cache index if we're querying the same SFF file */ | |
605 if (strcmp(sff_copy, sff) != 0) { | |
606 if (index) | |
607 xfree(index); | |
608 if (NULL == (index = (unsigned char *)xmalloc(index_length))) | |
609 return NULL; | |
610 | |
611 if (index_length != fread(index, 1, index_length, fp)) { | |
612 xfree(index); | |
613 return NULL; | |
614 } | |
615 strcpy(sff_copy, sff); | |
616 | |
617 /* Read the common header too - minimal decoding necessary */ | |
618 fseek(fp, 0, SEEK_SET); | |
619 if (31 != fread(chdr, 1, 31, fp)) | |
620 return NULL; | |
621 chdrlen = be_int2(*(uint16_t *)(chdr+24)); | |
622 nflows = be_int2(*(uint16_t *)(chdr+28)); | |
623 if (chdrlen-31 != fread(chdr+31, 1, chdrlen-31, fp)) | |
624 return NULL; | |
625 } | |
626 | |
627 /* | |
628 * Perform a binary search of the index, stopping when the search | |
629 * region becomes relatively small. This assumes that no accession | |
630 * number is near 200 characters. | |
631 */ | |
632 start = 0; | |
633 end = index_length; | |
634 while (end - start > 200) { | |
635 uint32_t mid; | |
636 int val; | |
637 mid = (start + end) / 2; | |
638 | |
639 /* | |
640 * From the byte midpoint, scan backwards to the beginning of the | |
641 * index record that covers that byte midpoint. | |
642 */ | |
643 while (mid > start && index[mid-1] != 255) { | |
644 mid--; | |
645 } | |
646 val = strcmp(accno, (char *)(index+mid)); | |
647 | |
648 if (val == 0) { | |
649 break; | |
650 } else if (val < 0) { | |
651 end = mid; | |
652 } else { | |
653 start = mid; | |
654 } | |
655 } | |
656 | |
657 /* | |
658 * Scan through the small search region, looking for the accno. | |
659 */ | |
660 while (start < end) { | |
661 if (strcmp(accno, (char *)(index+start)) == 0) { | |
662 /* | |
663 * If the accno is found, skip the accno characters, | |
664 * then get the record offset. | |
665 */ | |
666 for (us=index+start; *us; us++,start++) ; | |
667 us++; | |
668 start++; | |
669 | |
670 offset = getuint4_255(us); | |
671 if (us[4] != 255) { | |
672 return NULL; | |
673 } | |
674 | |
675 /* | |
676 * The original getsff.c here computed the record size by | |
677 * looking at the next index item and comparing it's offset to | |
678 * this one, or the end of file position if this is the last | |
679 * item. This has two problems: | |
680 * 1: It means the index itself cannot be added to the end of | |
681 * the file. | |
682 * 2: It means that we cannot simply add an index to a SFF | |
683 * file without also reordering all of the items within it. | |
684 * | |
685 * We solve this by reading the read header to work out the | |
686 * object size instead. | |
687 */ | |
688 break; | |
689 } | |
690 | |
691 /* | |
692 * Skip to the beginning of the next index element. | |
693 */ | |
694 while (start < end && index[start] != 255) { | |
695 start++; | |
696 } | |
697 start++; | |
698 } | |
699 | |
700 /* | |
701 * Now offset indicates the position of the SFF entry. Read and decode | |
702 * header to get data length. Then read this too. | |
703 */ | |
704 fseek(fp, offset, SEEK_SET); | |
705 if (16 != fread(rhdr, 1, 16, fp)) | |
706 return NULL; | |
707 | |
708 rhdrlen = be_int2(*(uint16_t *)rhdr); | |
709 nbases = be_int4(*(uint32_t *)(rhdr+4)); | |
710 | |
711 if (rhdrlen-16 != fread(rhdr+16, 1, rhdrlen-16, fp)) | |
712 return NULL; | |
713 dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7; | |
714 | |
715 /* Built up the fake SFF entry */ | |
716 if (NULL == (data = (char *)xmalloc(chdrlen + rhdrlen + dlen))) | |
717 return NULL; | |
718 | |
719 memcpy(data, chdr, chdrlen); | |
720 memcpy(data + chdrlen, rhdr, rhdrlen); | |
721 if (dlen != fread(data + chdrlen + rhdrlen, 1, dlen, fp)) { | |
722 xfree(data); | |
723 return NULL; | |
724 } | |
725 | |
726 /* Convert to mFILE */ | |
727 return sff_single(data, chdrlen + rhdrlen + dlen); | |
728 } | |
729 | |
730 | |
731 /* | |
732 * This returns an mFILE containing an SFF entry. | |
733 * | |
734 * This does the minimal decoding necessary to skip through the SFF | |
735 * container to find an entry. In this respect it is a semi-duplication | |
736 * of sff/sff.[ch], but implemented for efficiency. | |
737 * | |
738 * Having found an entry it packs the common header, the read specific | |
739 * header and the read data into a single block of memory and returns this | |
740 * as an mFILE. In essence it produces a single-read SFF archive. This | |
741 * is then decoded by the normal sff parsing code representing a small | |
742 * amount of redundancy, but one which is swamped by the I/O time. | |
743 */ | |
744 static mFILE *find_file_sff(char *entry, char *sff) { | |
745 static FILE *fp = NULL; | |
746 static char sff_copy[1024]; | |
747 char chdr[65536], rhdr[65536]; /* generous, but worst case */ | |
748 uint32_t nkey, nflows, chdrlen, rhdrlen, dlen, magic; | |
749 uint64_t file_pos; | |
750 static uint64_t index_offset = 0; | |
751 static uint32_t index_length = 0; | |
752 static char index_format[8]; | |
753 uint32_t nreads, i; | |
754 size_t entry_len = strlen(entry); | |
755 int bytes_per_flow = 2; | |
756 char *fake_file; | |
757 | |
758 /* | |
759 * Check cached information so rapid queries to the same archive are | |
760 * fast. | |
761 * ASSUMPTION: we won't externally replace the sff file with another of | |
762 * the same name. | |
763 */ | |
764 if (strcmp(sff, sff_copy) == 0) { | |
765 if (memcmp(index_format, ".hsh1.00", 8) == 0) { | |
766 return sff_hash_query(sff, entry, fp); | |
767 } else if (memcmp(index_format, ".srt1.00", 8) == 0) { | |
768 return sff_sorted_query(sff, entry, fp, index_length-8); | |
769 } | |
770 } | |
771 | |
772 if (fp) | |
773 fclose(fp); | |
774 | |
775 strcpy(sff_copy, sff); | |
776 *index_format = 0; | |
777 | |
778 | |
779 /* Read the common header */ | |
780 if (NULL == (fp = fopen(sff, "rb"))) | |
781 return NULL; | |
782 if (31 != fread(chdr, 1, 31, fp)) | |
783 return NULL; | |
784 | |
785 /* Check magic & vers: TODO */ | |
786 magic = be_int4(*(uint32_t *)chdr); | |
787 if (magic != SFF_MAGIC) | |
788 return NULL; | |
789 if (memcmp(chdr+4, SFF_VERSION, 4) != 0) | |
790 return NULL; | |
791 | |
792 /* If we have an index, use it, otherwise search linearly */ | |
793 index_offset = be_int8(*(uint64_t *)(chdr+8)); | |
794 index_length = be_int4(*(uint32_t *)(chdr+16)); | |
795 if (index_length != 0) { | |
796 long orig_pos = ftell(fp); | |
797 fseek(fp, index_offset, SEEK_SET); | |
798 fread(index_format, 1, 8, fp); | |
799 | |
800 if (memcmp(index_format, ".hsh1.00", 8) == 0) { | |
801 /* HASH index v1.00 */ | |
802 return sff_hash_query(sff, entry, fp); | |
803 | |
804 } else if (memcmp(index_format, ".srt1.00", 8) == 0) { | |
805 /* 454 sorted v1.00 */ | |
806 return sff_sorted_query(sff, entry, fp, index_length-8); | |
807 } else { | |
808 /* Unknown index: revert back to a slow linear scan */ | |
809 fseek(fp, orig_pos, SEEK_SET); | |
810 } | |
811 } | |
812 | |
813 nreads = be_int4(*(uint32_t *)(chdr+20)); | |
814 chdrlen = be_int2(*(uint16_t *)(chdr+24)); | |
815 nkey = be_int2(*(uint16_t *)(chdr+26)); | |
816 nflows = be_int2(*(uint16_t *)(chdr+28)); | |
817 | |
818 /* Read the remainder of the header */ | |
819 if (chdrlen-31 != fread(chdr+31, 1, chdrlen-31, fp)) | |
820 return NULL; | |
821 | |
822 file_pos = chdrlen; | |
823 | |
824 /* Loop until we find the correct entry */ | |
825 for (i = 0; i < nreads; i++) { | |
826 uint16_t name_len; | |
827 uint32_t nbases; | |
828 | |
829 /* Index could be between common header and first read - skip */ | |
830 if (file_pos == index_offset) { | |
831 fseek(fp, index_length, SEEK_CUR); | |
832 file_pos += index_length; | |
833 } | |
834 | |
835 /* Read 16 bytes to get name length */ | |
836 if (16 != fread(rhdr, 1, 16, fp)) | |
837 return NULL; | |
838 rhdrlen = be_int2(*(uint16_t *)rhdr); | |
839 name_len = be_int2(*(uint16_t *)(rhdr+2)); | |
840 nbases = be_int4(*(uint32_t *)(rhdr+4)); | |
841 | |
842 /* Read the rest of the header */ | |
843 if (rhdrlen-16 != fread(rhdr+16, 1, rhdrlen-16, fp)) | |
844 return NULL; | |
845 | |
846 file_pos += rhdrlen; | |
847 | |
848 dlen = (nflows * bytes_per_flow + nbases * 3 + 7) & ~7; | |
849 | |
850 if (name_len == entry_len && 0 == memcmp(rhdr+16, entry, entry_len)) | |
851 break; | |
852 | |
853 /* This is not the read you are looking for... */ | |
854 fseek(fp, dlen, SEEK_CUR); | |
855 } | |
856 | |
857 if (i == nreads) { | |
858 /* Not found */ | |
859 return NULL; | |
860 } | |
861 | |
862 /* | |
863 * Although we've decoded some bits already, we take the more modular | |
864 * approach of packing the sections together and passing the entire | |
865 * data structure off as a single-read SFF file to be decoded fully | |
866 * by the sff reading code. | |
867 */ | |
868 if (NULL == (fake_file = (char *)xmalloc(chdrlen + rhdrlen + dlen))) | |
869 return NULL; | |
870 | |
871 memcpy(fake_file, chdr, chdrlen); | |
872 memcpy(fake_file+chdrlen, rhdr, rhdrlen); | |
873 if (dlen != fread(fake_file+chdrlen+rhdrlen, 1, dlen, fp)) { | |
874 xfree(fake_file); | |
875 return NULL; | |
876 } | |
877 | |
878 /* Convert to an mFILE and return */ | |
879 return sff_single(fake_file, chdrlen+rhdrlen+dlen); | |
880 } | |
881 | |
882 /* | |
883 * Searches for file in the directory 'dirname'. If it finds it, it opens | |
884 * it. This also searches for compressed versions of the file in dirname | |
885 * too. | |
886 * | |
887 * Returns mFILE pointer if found | |
888 * NULL if not | |
889 */ | |
890 static mFILE *find_file_dir(char *file, char *dirname) { | |
891 char path[PATH_MAX+1], path2[PATH_MAX+1]; | |
892 size_t len = strlen(dirname); | |
893 char *cp; | |
894 | |
895 if (dirname[len-1] == '/') | |
896 len--; | |
897 | |
898 /* Special case for "./" or absolute filenames */ | |
899 if (*file == '/' || (len==1 && *dirname == '.')) | |
900 sprintf(path, "%s", file); | |
901 else | |
902 sprintf(path, "%.*s/%s", (int)len, dirname, file); | |
903 | |
904 if (is_file(path)) { | |
905 return mfopen(path, "rb"); | |
906 } | |
907 | |
908 /* | |
909 * Given a pathname /a/b/c if a/b is a file and not a directory then | |
910 * we'd get an ENOTDIR error. Instead we assume that a/b is an archive | |
911 * and we attempt to work out what type by reading the first and last | |
912 * bits of the file. | |
913 */ | |
914 if (cp = strrchr(file, '/')) { | |
915 strcpy(path2, path); /* path contains / too as it's from file */ | |
916 *strrchr(path2, '/') = 0; | |
917 | |
918 if (is_file(path2)) { | |
919 /* Open the archive to test for magic numbers */ | |
920 char magic[8]; | |
921 FILE *fp; | |
922 enum archive_type_t { | |
923 NONE, HASH, TAR, SFF, SRF | |
924 } type = NONE; | |
925 | |
926 if (NULL == (fp = fopen(path2, "rb"))) | |
927 return NULL; | |
928 memcpy(magic, "\0\0\0\0\0\0", 4); | |
929 fread(magic, 1, 4, fp); | |
930 | |
931 /* .hsh or .sff at start */ | |
932 if (memcmp(magic, ".hsh", 4) == 0) | |
933 type = HASH; | |
934 else if (memcmp(magic, ".sff", 4) == 0) | |
935 type = SFF; | |
936 | |
937 /* Or .hsh or Ihsh at the end */ | |
938 if (NONE == type) { | |
939 fseek(fp, -16, SEEK_END); | |
940 fread(magic, 1, 8, fp); | |
941 if (memcmp(magic+4, ".hsh", 4) == 0) | |
942 type = HASH; | |
943 else if (memcmp(magic, "Ihsh", 4) == 0) | |
944 type = SRF; | |
945 } | |
946 | |
947 /* or ustar 257 bytes in to indicate un-hashed tar */ | |
948 if (NONE == type) { | |
949 fseek(fp, 257, SEEK_SET); | |
950 fread(magic, 1, 5, fp); | |
951 if (memcmp(magic, "ustar", 5) == 0) | |
952 type = TAR; | |
953 } | |
954 fclose(fp); | |
955 | |
956 switch (type) { | |
957 case HASH: | |
958 return find_file_hash(cp+1, path2); | |
959 case TAR: | |
960 return find_file_tar(cp+1, path2, 0); | |
961 case SFF: | |
962 return find_file_sff(cp+1, path2); | |
963 case SRF: | |
964 return find_file_srf(cp+1, path2); | |
965 case NONE: | |
966 break; | |
967 } | |
968 | |
969 return NULL; | |
970 } | |
971 } | |
972 | |
973 return NULL; | |
974 } | |
975 | |
976 /* | |
977 * ------------------------------------------------------------------------ | |
978 * Public functions below. | |
979 */ | |
980 | |
981 /* | |
982 * Opens a trace file named 'file'. This is initially looked for as a | |
983 * pathname relative to a file named "relative_to". This may (for | |
984 * example) be the name of an experiment file referencing the trace | |
985 * file. In this case by passing relative_to as the experiment file | |
986 * filename the trace file will be picked up in the same directory as | |
987 * the experiment file. Relative_to may be supplied as NULL. | |
988 * | |
989 * 'file' is looked for at relative_to, then the current directory, and then | |
990 * all of the locations listed in 'path' (which is a colon separated list). | |
991 * If 'path' is NULL it uses the RAWDATA environment variable instead. | |
992 * | |
993 * Returns a mFILE pointer when found. | |
994 * NULL otherwise. | |
995 */ | |
996 mFILE *open_path_mfile(char *file, char *path, char *relative_to) { | |
997 char *newsearch; | |
998 char *ele; | |
999 mFILE *fp; | |
1000 | |
1001 /* Use path first */ | |
1002 if (!path) | |
1003 path = getenv("RAWDATA"); | |
1004 if (NULL == (newsearch = tokenise_search_path(path))) | |
1005 return NULL; | |
1006 | |
1007 /* | |
1008 * Step through the search path testing out each component. | |
1009 * We now look through each path element treating some prefixes as | |
1010 * special, otherwise we treat the element as a directory. | |
1011 */ | |
1012 for (ele = newsearch; *ele; ele += strlen(ele)+1) { | |
1013 int i; | |
1014 char *suffix[6] = {"", ".gz", ".bz2", ".sz", ".Z", ".bz2"}; | |
1015 for (i = 0; i < 6; i++) { | |
1016 char file2[1024]; | |
1017 char *ele2; | |
1018 int valid = 1; | |
1019 | |
1020 /* | |
1021 * '|' prefixing a path component indicates that we do not | |
1022 * wish to perform the compression extension searching in that | |
1023 * location. | |
1024 */ | |
1025 if (*ele == '|') { | |
1026 ele2 = ele+1; | |
1027 valid = (i == 0); | |
1028 } else { | |
1029 ele2 = ele; | |
1030 } | |
1031 | |
1032 sprintf(file2, "%s%s", file, suffix[i]); | |
1033 | |
1034 if (0 == strncmp(ele2, "TAR=", 4)) { | |
1035 if (valid && (fp = find_file_tar(file2, ele2+4, 0))) { | |
1036 free(newsearch); | |
1037 return fp; | |
1038 } | |
1039 | |
1040 } else if (0 == strncmp(ele2, "HASH=", 5)) { | |
1041 if (valid && (fp = find_file_hash(file2, ele2+5))) { | |
1042 free(newsearch); | |
1043 return fp; | |
1044 } | |
1045 #ifdef TRACE_ARCHIVE | |
1046 } else if (0 == strncmp(ele2, "ARC=", 4)) { | |
1047 if (valid && (fp = find_file_archive(file2, ele2+4))) { | |
1048 free(newsearch); | |
1049 return fp; | |
1050 } | |
1051 #endif | |
1052 #if defined(USE_WGET) || defined(HAVE_LIBCURL) | |
1053 } else if (0 == strncmp(ele2, "URL=", 4)) { | |
1054 if (valid && (fp = find_file_url(file2, ele2+4))) { | |
1055 free(newsearch); | |
1056 return fp; | |
1057 } | |
1058 #endif | |
1059 } else if (0 == strncmp(ele2, "SFF=", 4)) { | |
1060 if (valid && (fp = find_file_sff(file2, ele2+4))) { | |
1061 free(newsearch); | |
1062 return fp; | |
1063 } | |
1064 | |
1065 } else if (0 == strncmp(ele2, "SRF=", 4)) { | |
1066 if (valid && (fp = find_file_srf(file2, ele2+4))) { | |
1067 free(newsearch); | |
1068 return fp; | |
1069 } | |
1070 | |
1071 } else { | |
1072 if (valid && (fp = find_file_dir(file2, ele2))) { | |
1073 free(newsearch); | |
1074 return fp; | |
1075 } | |
1076 } | |
1077 } | |
1078 } | |
1079 | |
1080 free(newsearch); | |
1081 | |
1082 /* Look in the same location as the incoming 'relative_to' filename */ | |
1083 if (relative_to) { | |
1084 char *cp; | |
1085 char relative_path[PATH_MAX+1]; | |
1086 strcpy(relative_path, relative_to); | |
1087 if (cp = strrchr(relative_path, '/')) | |
1088 *cp = 0; | |
1089 if (fp = find_file_dir(file, relative_path)) | |
1090 return fp; | |
1091 } | |
1092 | |
1093 return NULL; | |
1094 } | |
1095 | |
1096 FILE *open_path_file(char *file, char *path, char *relative_to) { | |
1097 mFILE *mf = open_path_mfile(file, path, relative_to); | |
1098 FILE *fp; | |
1099 | |
1100 if (!mf) | |
1101 return NULL; | |
1102 | |
1103 if (mf->fp) | |
1104 return mf->fp; | |
1105 | |
1106 /* Secure temporary file generation */ | |
1107 if (NULL == (fp = tmpfile())) | |
1108 return NULL; | |
1109 | |
1110 /* Copy the data */ | |
1111 fwrite(mf->data, 1, mf->size, fp); | |
1112 rewind(fp); | |
1113 mfclose(mf); | |
1114 | |
1115 return fp; | |
1116 } | |
1117 | |
/*
 * Search path overrides for experiment and trace files. Both start out as
 * NULL, in which case the open_exp_* and open_trace_* functions fall back
 * to the EXP_PATH and TRACE_PATH environment variables respectively.
 */
static char *exp_path = NULL;
static char *trace_path = NULL;

/*
 * Sets the trace search path override. The pointer is stored as given; no
 * copy of the string is taken.
 */
void iolib_set_trace_path(char *path) {
    trace_path = path;
}

/* Returns the current trace search path override; NULL if none is set. */
char *iolib_get_trace_path(void) {
    return trace_path;
}

/*
 * Sets the experiment file search path override. The pointer is stored as
 * given; no copy of the string is taken.
 */
void iolib_set_exp_path(char *path) {
    exp_path = path;
}

/* Returns the current experiment search path override; NULL if none is set. */
char *iolib_get_exp_path(void) {
    return exp_path;
}
1125 | |
1126 /* | |
1127 * Trace file functions: uses TRACE_PATH environment variable. | |
1128 */ | |
1129 mFILE *open_trace_mfile(char *file, char *rel_to) { | |
1130 return open_path_mfile(file, trace_path ? trace_path | |
1131 : getenv("TRACE_PATH"), rel_to); | |
1132 } | |
1133 | |
1134 FILE *open_trace_file(char *file, char *rel_to) { | |
1135 return open_path_file(file, trace_path ? trace_path | |
1136 : getenv("TRACE_PATH"), rel_to); | |
1137 } | |
1138 | |
1139 /* | |
1140 * Trace file functions: uses EXP_PATH environment variable. | |
1141 */ | |
1142 mFILE *open_exp_mfile(char *file, char *relative_to) { | |
1143 return open_path_mfile(file, exp_path ? exp_path | |
1144 : getenv("EXP_PATH"), relative_to); | |
1145 } | |
1146 | |
1147 FILE *open_exp_file(char *file, char *relative_to) { | |
1148 return open_path_file(file, exp_path ? exp_path | |
1149 : getenv("EXP_PATH"), relative_to); | |
1150 } | |
1151 |