comparison srf2fastq/io_lib-1.12.2/io_lib/compression.h @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 #ifndef _COMPRESSION_H_
2 #define _COMPRESSION_H_
3
4 #include "io_lib/os.h"
5 #include <zlib.h>
6
7 #ifdef __cplusplus
8 extern "C" {
9 #endif
10
11 /*
12 * zlib_huff()
13 *
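14 * Compresses data using huffman encoding, as implemented by zlib. The 'strategy' argument is not described below; presumably it is the zlib compression strategy to use (to be confirmed against the implementation).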
15 *
16 * Arguments:
17 * uncomp Uncompressed input data
18 * uncomp_len Length of uncomp data
19 * comp_len Output: length of compressed data
20 *
21 * Returns:
22 * Compressed data if successful
23 * NULL if not successful
24 */
25 char *zlib_huff(char *uncomp, int uncomp_len, int strategy, int *comp_len);
26
27 /*
28 * zlib_dehuff()
29 *
30 * Uncompresses data using huffman encoding, as implemented by zlib.
31 *
32 * Arguments:
33 * comp Compressed input data
34 * comp_len Length of comp data
35 * uncomp_len Output: length of uncompressed data
36 *
37 * Returns:
38 * Uncompressed data if successful
39 * NULL if not successful
40 */
41 char *zlib_dehuff(char *comp, int comp_len, int *uncomp_len);
42
43 /*
44 * zlib_dehuff2()
45 *
46 * Uncompresses data using huffman encoding, as implemented by zlib.
47 * Similar to zlib_dehuff above, but with the following differences:
48 *
49 * 1) It pastes together the zlib stream from two components; comp1+comp2
50 * with the last byte of comp1 overlapping (ORed) with the first byte
51 * of comp2. This allows for separation of the huffman codes from
52 * the compressed data itself.
53 * 2) It uses the raw Deflate format rather than Zlib's wrapping of it.
54 * 3) It uses an EOF symbol to mark the end rather than encoding the
55 * uncompressed size in the header
56 *
57 *
58 * Arguments:
59 * comp1 Compressed input data part 1
60 * comp1_len Length of comp1 data
61 * comp2 Compressed input data part 2
62 * comp2_len Length of comp2 data
63 * uncomp_len Output: length of uncompressed data
64 *
65 * Returns:
66 * Uncompressed data if successful
67 * NULL if not successful
68 */
69 char *zlib_dehuff2(char *comp1, int comp1_len,
70 char *comp2, int comp2_len,
71 int *uncomp_len);
72
73 /*
74 * Run length encoding.
75 *
76 * Any run of 3 or more identical characters (up to 255 in a row) are replaced
77 * by a 'guard' byte followed by the number of characters followed by
78 * the character value itself.
79 * Any single guard value in the input is escaped using 'guard 0'.
80 *
81 * Specifying guard as -1 will automatically pick one of the least used
82 * characters in the input as the guard.
83 *
84 * Arguments:
85 * uncomp Input data
86 * uncomp_len Length of input data 'uncomp'
87 * guard Guard byte - used to encode "N" copies of data
88 * comp_len Output: length of compressed data
89 *
90 * Returns:
91 * Compressed data if successful
92 * NULL if not successful
93 */
94 char *rle(char *uncomp, int uncomp_len, int guard, int *comp_len);
95
96 /*
97 * Reverses run length encoding.
98 *
99 * Arguments:
100 * comp Compressed input data
101 * comp_len Length of comp data
102 * uncomp_len Output: length of uncompressed data
103 *
104 * Returns:
105 * Uncompressed data if successful
106 * NULL if not successful
107 */
108 char *unrle(char *comp, int comp_len, int *uncomp_len);
109
110 /*
111 * Multi-byte run length encoding.
112 *
113 * Any run of 3 or more identical characters (up to 255 in a row) are replaced
114 * by a 'guard' byte followed by the number of characters followed by
115 * the character value itself.
116 * Any single guard value in the input is escaped using 'guard 0'.
117 *
118 * Specifying guard as -1 will automatically pick one of the least used
119 * characters in the input as the guard.
120 *
121 * Arguments:
122 * uncomp Input data
123 * uncomp_len Length of input data 'uncomp'
124 * guard Guard byte - used to encode "N" copies of data
125 * rsz Size of blocks to compare for run checking.
126 * comp_len Output: length of compressed data
127 *
128 * Returns:
129 * Compressed data if successful
130 * NULL if not successful
131 */
132 char *xrle(char *uncomp, int uncomp_len, int guard, int rsz, int *comp_len);
133
134 /*
135 * Reverses multi-byte run length encoding.
136 *
137 * Arguments:
138 * comp Compressed input data
139 * comp_len Length of comp data
140 * uncomp_len Output: length of uncompressed data
141 *
142 * Returns:
143 * Uncompressed data if successful
144 * NULL if not successful
145 */
146 char *unxrle(char *comp, int comp_len, int *uncomp_len);
147
148 /*
149 * Multi-byte run length encoding.
150 *
151 * Steps along in words of size 'rsz'. Unlike XRLE above this does run-length
152 * encoding by writing out an additional "length" word every time 2 or more
153 * words in a row are spotted. This removes the need for a guard byte.
154 *
155 * Additionally this method ensures that both input and output formats remain
156 * aligned on words of size 'rsz'.
157 *
158 * Arguments:
159 * uncomp Input data
160 * uncomp_len Length of input data 'uncomp'
161 * rsz Size of blocks to compare for run checking.
162 * comp_len Output: length of compressed data
163 *
164 * Returns:
165 * Compressed data if successful
166 * NULL if not successful
167 */
168 char *xrle2(char *uncomp, int uncomp_len, int rsz, int *comp_len);
169
170 /*
171 * Reverses multi-byte run length encoding (xrle2).
172 *
173 * Arguments:
174 * comp Compressed input data
175 * comp_len Length of comp data
176 * uncomp_len Output: length of uncompressed data
177 *
178 * Returns:
179 * Uncompressed data if successful
180 * NULL if not successful
181 */
182 char *unxrle2(char *comp, int comp_len, int *uncomp_len);
183
184 /*
185 * decorrelate1()
186 *
187 * Produce successive deltas from a 1-byte array.
188 *
189 * Arguments:
190 * uncomp Uncompressed data
191 * uncomp_len Length of uncompressed data
192 * level Differencing level (must be 1, 2 or 3)
193 * comp_len Return: where to store new compressed length
194 *
195 * Returns:
196 * Success: A decorrelated buffer (malloced)
197 * Failure: NULL
198 */
199 char *decorrelate1(char *uncomp, int uncomp_len, int level, int *comp_len);
200 char *decorrelate1dyn(char *s_uncomp, int uncomp_len, int *comp_len);
201
202 /*
203 * recorrelate1()
204 *
205 * The reverse of decorrelate1()
206 *
207 * Arguments:
208 * comp Compressed input data
209 * comp_len Length of comp data
210 * uncomp_len Output: length of uncompressed data
211 *
212 * Returns:
213 * Success: uncompressed data
214 * Failure: NULL
215 */
216 char *recorrelate1(char *comp, int comp_len, int *uncomp_len);
217
218 /*
219 * decorrelate2()
220 *
221 * Produce successive deltas from a 2-byte array (big endian)
222 *
223 * Arguments:
224 * uncomp Uncompressed data
225 * uncomp_len Length of uncompressed data
226 * level Differencing level (must be 1, 2 or 3)
227 * comp_len Return: where to store new compressed length
228 *
229 * Returns:
230 * Success: A decorrelated buffer (malloced)
231 * Failure: NULL
232 */
233 char *decorrelate2(char *uncomp, int uncomp_len, int level, int *comp_len);
234 char *decorrelate2dyn(char *s_uncomp, int uncomp_len, int *comp_len);
235
236 /*
237 * recorrelate2()
238 *
239 * The reverse of decorrelate2()
240 *
241 * Arguments:
242 * comp Compressed input data
243 * comp_len Length of comp data
244 * uncomp_len Output: length of uncompressed data
245 *
246 * Returns:
247 * Success: uncompressed data
248 * Failure: NULL
249 */
250 char *recorrelate2(char *comp, int comp_len, int *uncomp_len);
251
252 /*
253 * decorrelate4()
254 *
255 * Produce successive deltas from a 4-byte array (big endian)
256 *
257 * Arguments:
258 * uncomp Uncompressed data
259 * uncomp_len Length of uncompressed data
260 * level Differencing level (must be 1, 2 or 3)
261 * comp_len Return: where to store new compressed length
262 *
263 * Returns:
264 * Success: A decorrelated buffer (malloced)
265 * Failure: NULL
266 */
267 char *decorrelate4(char *uncomp, int uncomp_len, int level, int *comp_len);
268
269 /*
270 * recorrelate4()
271 *
272 * The reverse of decorrelate4()
273 *
274 * Arguments:
275 * comp Compressed input data
276 * comp_len Length of comp data
277 * uncomp_len Output: length of uncompressed data
278 *
279 * Returns:
280 * Success: uncompressed data
281 * Failure: NULL
282 */
283 char *recorrelate4(char *comp, int comp_len, int *uncomp_len);
284
285 /*
286 * shrink_16to8()
287 *
288 * Stores an array of 16-bit (big endian) array elements in an 8-bit array.
289 * We assume that most 16-bit elements encode numbers that fit in an 8-bit
290 * value. When not possible, we store a marker followed by the 16-bit value
291 * stored as multiple 8-bit values.
292 *
293 * uncomp Uncompressed data
294 * uncomp_len Length of uncompressed data (in bytes)
295 * comp_len Return: where to store new compressed length
296 *
297 * Returns:
298 * Success: An 8-bit array (malloced)
299 * Failure: NULL
300 */
301 char *shrink_16to8(char *uncomp, int uncomp_len, int *comp_len);
302
303 /*
304 * expand_8to16()
305 *
306 * The opposite of the shrink_16to8() function.
307 *
308 * comp Compressed input data
309 * comp_len Length of comp data (in bytes)
310 * uncomp_len Output: length of uncompressed data (in bytes)
311 *
312 * Returns:
313 * Success: Uncompressed data (char *)
314 * Failure: NULL
315 */
316 char *expand_8to16(char *comp, int comp_len, int *uncomp_len);
317
318 /*
319 * shrink_32to8()
320 *
321 * Stores an array of 32-bit (big endian) array elements in an 8-bit array.
322 * We assume that most 32-bit elements encode numbers that fit in an 8-bit
323 * value. When not possible, we store a marker followed by the 32-bit value
324 * stored as multiple 8-bit values.
325 *
326 * uncomp Uncompressed data
327 * uncomp_len Length of uncompressed data (in bytes)
328 * comp_len Return: where to store new compressed length
329 *
330 * Returns:
331 * Success: An 8-bit array (malloced)
332 * Failure: NULL
333 */
334 char *shrink_32to8(char *uncomp, int uncomp_len, int *comp_len);
335
336 /*
337 * expand_8to32()
338 *
339 * The opposite of the shrink_32to8() function.
340 *
341 * comp Compressed input data
342 * comp_len Length of comp data (in bytes)
343 * uncomp_len Output: length of uncompressed data (in bytes)
344 *
345 * Returns:
346 * Success: Uncompressed data (char *)
347 * Failure: NULL
348 */
349 char *expand_8to32(char *comp, int comp_len, int *uncomp_len);
350
351 char *follow1(char *s_uncomp,
352 int uncomp_len,
353 int *comp_len);
354
355 char *unfollow1(char *s_comp,
356 int comp_len,
357 int *uncomp_len);
358
359 char *ichebcomp(char *uncomp,
360 int uncomp_len,
361 int *data_len);
362
363 char *ichebuncomp(char *comp,
364 int comp_len,
365 int *uncomp_len);
366
367 /*
368 * This is a LOSSY compression. It replaces N with 10 * log2(N).
369 */
370 char *log2_data(char *x_uncomp,
371 int uncomp_len,
372 int *comp_len);
373
374 char *unlog2_data(char *x_comp,
375 int comp_len,
376 int *uncomp_len);
377
378 /*
379 * Implements compression using a set of static huffman codes stored using
380 * the Deflate algorithm (and so in this respect it's similar to zlib).
381 *
382 * The huffman codes though can be previously stored in the ztr object
383 * using ztr_add_hcode(). "cset" indicates which numbered stored huffman
384 * code set is to be used, or passing zero will use inline codes (ie they
385 * are stored in the data stream itself, just as in standard deflate).
386 *
387 * Arguments:
388 * ztr ztr_t pointer; used to find stored code-sets
389 * uncomp The uncompressed input data
390 * uncomp_len Length of uncomp
391 * cset Stored code-set number, zero for inline
392 * recsz Record size - only used when cset == 0.
393 * comp_len Output: length of compressed data
394 *
395 * Returns:
396 * Compressed data stream if successful + comp_len
397 * NULL on failure
398 */
399 char *sthuff(ztr_t *ztr, char *uncomp, int uncomp_len,
400 int cset, int recsz, int *comp_len);
401 char *unsthuff(ztr_t *ztr, char *comp, int comp_len, int *uncomp_len);
402
403 /*
404 * Reorders quality data from its RAW format to an interleaved 4-byte
405 * aligned format.
406 *
407 * Starting with sequence A1 C2 G3 the raw format is quality of called
408 * bases followed by quality of remaining bases:
409 * 0 (RAW format)
410 * Q(A1) Q(C2) Q(G3)
411 * Q(C1) Q(A2) Q(A3)
412 * Q(G1) Q(G2) Q(C3)
413 * Q(T1) Q(T2) Q(T3)
414 *
415 * We reorder it to:
416 * ZTR_FORM_QSHIFT <any> <any> 0(raw)
417 * Q(A1) Q(C1) Q(G1) Q(T1)
418 * Q(C2) Q(A2) Q(G2) Q(T2)
419 * Q(G3) Q(A3) Q(C3) Q(T3)
420 *
421 * Returns shifted data on success
422 * NULL on failure
423 */
424 char *qshift(char *qold, int qlen, int *new_len);
425 char *unqshift(char *qold, int qlen, int *new_len);
426
427 /*
428 * Given a sequence ACTG this shifts trace data from the order:
429 *
430 * A1A2A3A4 C1C2C3C4 G1G2G3G4 T1T2T3T4
431 *
432 * to
433 *
434 * A1C1G1T1 C2A2G2T2 T3A3C3G3 G4A4C4T4
435 *
436 * Ie for each base it outputs the signal for the called base first
437 * followed by the remaining 3 signals in A,C,G,T order (minus the
438 * called signal already output).
439 */
440 char *tshift(ztr_t *ztr, char *told_c, int tlen, int *new_len);
441 char *untshift(ztr_t *ztr, char *told_c, int tlen, int *new_len);
442
443 #ifdef __cplusplus
444 }
445 #endif
446
447 #endif /* _COMPRESSION_H_ */