diff srf2fastq/io_lib-1.12.2/io_lib/compression.h @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srf2fastq/io_lib-1.12.2/io_lib/compression.h	Tue Jun 07 17:48:05 2011 -0400
@@ -0,0 +1,447 @@
+#ifndef _COMPRESSION_H_
+#define _COMPRESSION_H_
+
+#include "io_lib/os.h"
+#include <zlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * zlib_huff()
+ *
+ * Compresses data using huffman encoding, as implemented by zlib.
+ *
+ * Arguments:
+ *	uncomp		Uncompressed input data
+ *	uncomp_len	Length of uncomp data
+ *	strategy	NOTE(review): not described in the original comment;
+ *			presumably a zlib deflate strategy constant (e.g.
+ *			Z_HUFFMAN_ONLY) - confirm against compression.c
+ *	comp_len	Output: length of compressed data
+ *
+ * Returns:
+ *	Compressed data if successful
+ *	NULL if not successful
+ */
+char *zlib_huff(char *uncomp, int uncomp_len, int strategy, int *comp_len);
+
+/*
+ * zlib_dehuff()
+ *
+ * Uncompresses data using huffman encoding, as implemented by zlib.
+ *
+ * Going by the list of differences given for zlib_dehuff2(), the stream
+ * accepted here uses zlib's wrapping of Deflate and encodes the
+ * uncompressed size in its header.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Uncompressed data if successful
+ *	NULL if not successful
+ */
+char *zlib_dehuff(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * zlib_dehuff2()
+ *
+ * Uncompresses data using huffman encoding, as implemented by zlib.
+ * Similar to zlib_dehuff() above, but with the following differences:
+ *
+ * 1) It pastes together the zlib stream from two components, comp1+comp2,
+ *    with the last byte of comp1 overlapping (ORed with) the first byte
+ *    of comp2. This allows for separation of the huffman codes from
+ *    the compressed data itself.
+ * 2) It uses the raw Deflate format rather than zlib's wrapping of it.
+ * 3) It uses an EOF symbol to mark the end rather than encoding the
+ *    uncompressed size in the header.
+ *
+ * Arguments:
+ *	comp1		Compressed input data part 1
+ *	comp1_len	Length of comp1 data
+ *	comp2		Compressed input data part 2
+ *	comp2_len	Length of comp2 data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Uncompressed data if successful
+ *	NULL if not successful
+ */
+char *zlib_dehuff2(char *comp1, int comp1_len,
+		   char *comp2, int comp2_len,
+		   int *uncomp_len);
+
+/*
+ * rle()
+ *
+ * Run length encoding.
+ *
+ * Any run of 3 or more identical characters (up to 255 in a row) is replaced
+ * by a 'guard' byte followed by the number of characters followed by
+ * the character value itself.
+ * Any single guard value in the input is escaped using 'guard 0'.
+ *
+ * Specifying guard as -1 will automatically pick one of the least used
+ * characters in the input as the guard.
+ *
+ * Arguments:
+ *	uncomp		Input data
+ *	uncomp_len	Length of input data 'uncomp'
+ *	guard		Guard byte - used to encode "N" copies of data;
+ *			-1 to have one picked automatically (see above)
+ *	comp_len	Output: length of compressed data
+ *
+ * Returns:
+ *	Compressed data if successful
+ *	NULL if not successful
+ */
+char *rle(char *uncomp, int uncomp_len, int guard, int *comp_len);
+
+/*
+ * unrle()
+ *
+ * Reverses run length encoding.
+ *
+ * NOTE(review): no guard argument is taken, so the guard byte is presumably
+ * read from the compressed stream itself - confirm against compression.c.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Uncompressed data if successful
+ *	NULL if not successful
+ */
+char *unrle(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * xrle()
+ *
+ * Multi-byte run length encoding.
+ *
+ * Any run of 3 or more identical characters (up to 255 in a row) is replaced
+ * by a 'guard' byte followed by the number of characters followed by
+ * the character value itself.
+ * Any single guard value in the input is escaped using 'guard 0'.
+ *
+ * NOTE(review): the wording above is the same as for rle(); given the rsz
+ * argument, runs are presumably detected over blocks of rsz bytes rather
+ * than over single characters - confirm against compression.c.
+ *
+ * Specifying guard as -1 will automatically pick one of the least used
+ * characters in the input as the guard.
+ *
+ * Arguments:
+ *	uncomp		Input data
+ *	uncomp_len	Length of input data 'uncomp'
+ *	guard		Guard byte - used to encode "N" copies of data;
+ *			-1 to have one picked automatically (see above)
+ *	rsz		Size of blocks to compare for run checking.
+ *	comp_len	Output: length of compressed data
+ *
+ * Returns:
+ *	Compressed data if successful
+ *	NULL if not successful
+ */
+char *xrle(char *uncomp, int uncomp_len, int guard, int rsz, int *comp_len);
+
+/*
+ * unxrle()
+ *
+ * Reverses multi-byte run length encoding.
+ *
+ * NOTE(review): neither a guard nor a block size (rsz) is passed in, so both
+ * are presumably recovered from the compressed stream - confirm against
+ * compression.c.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Uncompressed data if successful
+ *	NULL if not successful
+ */
+char *unxrle(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * xrle2()
+ *
+ * Multi-byte run length encoding.
+ *
+ * Steps along in words of size 'rsz'. Unlike xrle() above this does
+ * run-length encoding by writing out an additional "length" word every time
+ * 2 or more words in a row are spotted. This removes the need for a guard
+ * byte.
+ *
+ * Additionally this method ensures that both input and output formats remain
+ * aligned on words of size 'rsz'.
+ *
+ * Arguments:
+ *	uncomp		Input data
+ *	uncomp_len	Length of input data 'uncomp'
+ *	rsz		Size of blocks to compare for run checking.
+ *	comp_len	Output: length of compressed data
+ *
+ * Returns:
+ *	Compressed data if successful
+ *	NULL if not successful
+ */
+char *xrle2(char *uncomp, int uncomp_len, int rsz, int *comp_len);
+
+/*
+ * unxrle2()
+ *
+ * Reverses the multi-byte run length encoding of xrle2().
+ *
+ * NOTE(review): the original comment referred to "xrle_new", which is not
+ * declared in this header; presumably a former name of xrle2() - confirm.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Uncompressed data if successful
+ *	NULL if not successful
+ */
+char *unxrle2(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * decorrelate1(), decorrelate1dyn()
+ *
+ * Produce successive deltas from a 1-byte array.
+ *
+ * decorrelate1dyn() takes the same arguments minus 'level'.
+ * NOTE(review): it is not described in this header; presumably it picks the
+ * differencing level itself - confirm against compression.c.
+ *
+ * Arguments:
+ *	uncomp		Uncompressed data
+ *	uncomp_len	Length of uncompressed data
+ *	level		Differencing level (must be 1, 2 or 3)
+ *	comp_len	Return: where to store new compressed length
+ *
+ * Returns:
+ *	Success: A decorrelated buffer (malloced)
+ *	Failure: NULL
+ */
+char *decorrelate1(char *uncomp, int uncomp_len, int level, int *comp_len);
+char *decorrelate1dyn(char *s_uncomp, int uncomp_len, int *comp_len);
+
+/*
+ * recorrelate1()
+ *
+ * The reverse of decorrelate1()
+ *
+ * NOTE(review): no differencing level is passed in, so it is presumably
+ * recorded in the compressed data - confirm against compression.c.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Success: uncompressed data
+ *	Failure: NULL
+ */
+char *recorrelate1(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * decorrelate2(), decorrelate2dyn()
+ *
+ * Produce successive deltas from a 2-byte array (big endian)
+ *
+ * decorrelate2dyn() takes the same arguments minus 'level'.
+ * NOTE(review): it is not described in this header; presumably it picks the
+ * differencing level itself - confirm against compression.c.
+ *
+ * Arguments:
+ *	uncomp		Uncompressed data
+ *	uncomp_len	Length of uncompressed data
+ *	level		Differencing level (must be 1, 2 or 3)
+ *	comp_len	Return: where to store new compressed length
+ *
+ * Returns:
+ *	Success: A decorrelated buffer (malloced)
+ *	Failure: NULL
+ */
+char *decorrelate2(char *uncomp, int uncomp_len, int level, int *comp_len);
+char *decorrelate2dyn(char *s_uncomp, int uncomp_len, int *comp_len);
+
+/*
+ * recorrelate2()
+ *
+ * The reverse of decorrelate2()
+ *
+ * NOTE(review): no differencing level is passed in, so it is presumably
+ * recorded in the compressed data - confirm against compression.c.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Success: uncompressed data
+ *	Failure: NULL
+ */
+char *recorrelate2(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * decorrelate4()
+ *
+ * Produce successive deltas from a 4-byte array (big endian)
+ *
+ * Unlike the 1- and 2-byte variants, no "dyn" form of this routine is
+ * declared in this header.
+ *
+ * Arguments:
+ *	uncomp		Uncompressed data
+ *	uncomp_len	Length of uncompressed data
+ *	level		Differencing level (must be 1, 2 or 3)
+ *	comp_len	Return: where to store new compressed length
+ *
+ * Returns:
+ *	Success: A decorrelated buffer (malloced)
+ *	Failure: NULL
+ */
+char *decorrelate4(char *uncomp, int uncomp_len, int level, int *comp_len);
+
+/*
+ * recorrelate4()
+ *
+ * The reverse of decorrelate4()
+ *
+ * NOTE(review): no differencing level is passed in, so it is presumably
+ * recorded in the compressed data - confirm against compression.c.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data
+ *	uncomp_len	Output: length of uncompressed data
+ *
+ * Returns:
+ *	Success: uncompressed data
+ *	Failure: NULL
+ */
+char *recorrelate4(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * shrink_16to8()
+ *
+ * Stores an array of 16-bit (big endian) elements in an 8-bit array.
+ * We assume that most 16-bit elements encode numbers that fit in an 8-bit
+ * value. When not possible, we store a marker followed by the 16-bit value
+ * stored as multiple 8-bit values.
+ *
+ * Arguments:
+ *	uncomp		Uncompressed data
+ *	uncomp_len	Length of uncompressed data (in bytes)
+ *	comp_len	Return: where to store new compressed length
+ *
+ * Returns:
+ *	Success: An 8-bit array (malloced)
+ *	Failure: NULL
+ */
+char *shrink_16to8(char *uncomp, int uncomp_len, int *comp_len);
+
+/*
+ * expand_8to16()
+ *
+ * The opposite of the shrink_16to8() function.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data (in bytes)
+ *	uncomp_len	Output: length of uncompressed data (in bytes)
+ *
+ * Returns:
+ *	Success: Uncompressed data (char *)
+ *	Failure: NULL
+ */
+char *expand_8to16(char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * shrink_32to8()
+ *
+ * Stores an array of 32-bit (big endian) elements in an 8-bit array.
+ * We assume that most 32-bit elements encode numbers that fit in an 8-bit
+ * value. When not possible, we store a marker followed by the 32-bit value
+ * stored as multiple 8-bit values.
+ *
+ * Arguments:
+ *	uncomp		Uncompressed data
+ *	uncomp_len	Length of uncompressed data (in bytes)
+ *	comp_len	Return: where to store new compressed length
+ *
+ * Returns:
+ *	Success: An 8-bit array (malloced)
+ *	Failure: NULL
+ */
+char *shrink_32to8(char *uncomp, int uncomp_len, int *comp_len);
+
+/*
+ * expand_8to32()
+ *
+ * The opposite of the shrink_32to8() function.
+ *
+ * Arguments:
+ *	comp		Compressed input data
+ *	comp_len	Length of comp data (in bytes)
+ *	uncomp_len	Output: length of uncompressed data (in bytes)
+ *
+ * Returns:
+ *	Success: Uncompressed data (char *)
+ *	Failure: NULL
+ */
+char *expand_8to32(char *comp, int comp_len, int *uncomp_len);
+
+char *follow1(char *s_uncomp,	/* NOTE(review): undocumented in this header; */
+	      int uncomp_len,	/* going by the other routines here: input,   */
+	      int *comp_len);	/* its length, output length (out) - confirm  */
+
+char *unfollow1(char *s_comp,	/* NOTE(review): presumably the reverse of    */
+		int comp_len,	/* follow1() with the same argument           */
+		int *uncomp_len);	/* convention - confirm in compression.c      */
+
+char *ichebcomp(char *uncomp,	/* NOTE(review): undocumented in this header; */
+		int uncomp_len,	/* the last parameter (presumably the output  */
+		int *data_len);	/* length) is data_len, not comp_len - confirm */
+
+char *ichebuncomp(char *comp,	/* NOTE(review): presumably the reverse of    */
+		  int comp_len,	/* ichebcomp() - confirm against              */
+		  int *uncomp_len);	/* compression.c                              */
+
+/*
+ * log2_data(), unlog2_data()
+ *
+ * This is a LOSSY compression. It replaces N with 10 * log2(N).
+ *
+ * NOTE(review): the element width and byte order of the data are not stated
+ * in this header. unlog2_data() is presumably the reverse transform (and so
+ * only approximate, given the lossiness) - confirm against compression.c.
+ */
+char *log2_data(char *x_uncomp,
+		int uncomp_len,
+		int *comp_len);
+
+char *unlog2_data(char *x_comp,
+		  int comp_len,
+		  int *uncomp_len);
+
+/*
+ * sthuff(), unsthuff()
+ *
+ * Implements compression using a set of static huffman codes stored using
+ * the Deflate algorithm (and so in this respect it's similar to zlib).
+ *
+ * The huffman codes though can be previously stored in the ztr object
+ * using ztr_add_hcode(). "cset" indicates which numbered stored huffman
+ * code set is to be used, or passing zero will use inline codes (ie they
+ * are stored in the data stream itself, just as in standard deflate).
+ *
+ * NOTE(review): unsthuff() is not described in this header; presumably it
+ * reverses sthuff(), taking the compressed stream and returning the
+ * uncompressed data with its length in *uncomp_len - confirm against
+ * compression.c.
+ *
+ * NOTE(review): ztr_t is not declared in this header itself; presumably it
+ * comes in via io_lib/os.h or must be declared by the includer - confirm.
+ *
+ * Arguments:
+ *	ztr		ztr_t pointer; used to find stored code-sets
+ *	uncomp		The uncompressed input data
+ *	uncomp_len	Length of uncomp
+ *	cset		Stored code-set number, zero for inline
+ *	recsz		Record size - only used when cset == 0.
+ *	comp_len	Output: length of compressed data
+ *
+ * Returns:
+ *	Compressed data stream if successful, with its length in *comp_len
+ *	NULL on failure
+ */
+char *sthuff(ztr_t *ztr, char *uncomp, int uncomp_len, 
+	     int cset, int recsz, int *comp_len);
+char *unsthuff(ztr_t *ztr, char *comp, int comp_len, int *uncomp_len);
+
+/*
+ * qshift(), unqshift()
+ *
+ * Reorders quality data from its RAW format to an interleaved 4-byte
+ * aligned format.
+ *
+ * Starting with sequence A1 C2 G3 the raw format is quality of called
+ * bases followed by quality of remaining bases:
+ * 0 (RAW format)
+ * Q(A1) Q(C2) Q(G3)
+ * Q(C1) Q(A2) Q(A3)
+ * Q(G1) Q(G2) Q(C3)
+ * Q(T1) Q(T2) Q(T3)
+ *
+ * NOTE(review): the lower three rows are drawn with one column per base
+ * (column k holds the three uncalled qualities of base k). The exact byte
+ * order of the uncalled qualities in the raw stream is not established by
+ * this header - confirm against the ZTR specification / compression.c.
+ *
+ * We reorder it to:
+ * ZTR_FORM_QSHIFT <any> <any> 0(raw)
+ * Q(A1) Q(C1) Q(G1) Q(T1)
+ * Q(C2) Q(A2) Q(G2) Q(T2)
+ * Q(G3) Q(A3) Q(C3) Q(T3)
+ *
+ * NOTE(review): unqshift() is not described here; presumably it performs
+ * the reverse reordering - confirm. By the convention of the other routines
+ * in this header, qold/qlen are the input data and its length and *new_len
+ * receives the output length - confirm.
+ *
+ * Returns shifted data on success
+ *         NULL on failure
+ */
+char *qshift(char *qold, int qlen, int *new_len);
+char *unqshift(char *qold, int qlen, int *new_len);
+
+/*
+ * tshift(), untshift()
+ *
+ * Given a sequence ACTG this shifts trace data from the order:
+ *
+ *     A1A2A3A4 C1C2C3C4 G1G2G3G4 T1T2T3T4
+ *
+ * to
+ *
+ *     A1C1G1T1 C2A2G2T2 T3A3C3G3 G4A4C4T4
+ *
+ * Ie for each base it outputs the signal for the called base first
+ * followed by the remaining 3 signals in A,C,G,T order (minus the
+ * called signal already output).
+ *
+ * NOTE(review): untshift() is not described here; presumably it restores
+ * the original order - confirm against compression.c. The ztr argument is
+ * presumably used to look up the called sequence - confirm.
+ */
+char *tshift(ztr_t *ztr, char *told_c, int tlen, int *new_len);
+char *untshift(ztr_t *ztr, char *told_c, int tlen, int *new_len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _COMPRESSION_H_ */