From: Michal Vaner
Date: Thu, 27 Nov 2008 16:19:17 +0000 (+0100)
Subject: ucw docs: The lizard compression algorithm
X-Git-Tag: holmes-import~140
X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=fdb438738b41d339e87aaa0c93620a45660ffa5d;p=libucw.git

ucw docs: The lizard compression algorithm
---

diff --git a/ucw/doc/Makefile b/ucw/doc/Makefile
index c14ff1ec..ecb22d5c 100644
--- a/ucw/doc/Makefile
+++ b/ucw/doc/Makefile
@@ -2,7 +2,7 @@
 DIRS+=ucw/doc
 
-UCW_DOCS=fastbuf index config configure install basecode hash docsys conf mempool eltpool mainloop generic growbuf unaligned lists chartype unicode prime binsearch heap binheap
+UCW_DOCS=fastbuf index config configure install basecode hash docsys conf mempool eltpool mainloop generic growbuf unaligned lists chartype unicode prime binsearch heap binheap compress
 
 UCW_INDEX=$(o)/ucw/doc/def_index.html
 UCW_DOCS_HTML=$(addprefix $(o)/ucw/doc/,$(addsuffix .html,$(UCW_DOCS)))
diff --git a/ucw/doc/compress.txt b/ucw/doc/compress.txt
new file mode 100644
index 00000000..7e950997
--- /dev/null
+++ b/ucw/doc/compress.txt
@@ -0,0 +1,15 @@
+Compression
+===========
+
+The library contains a compression routine called LiZaRd. It is a
+modified Lempel-Ziv 77 method with a slightly worse compression ratio,
+but with faster compression and decompression.
+
+// TODO Meaning of the abbreviation
+// TODO Actual numbers for how fast it is
+
+- <>
+- <>
+- <>
+
+!!ucw/lizard.h
diff --git a/ucw/doc/hash.txt b/ucw/doc/hash.txt
index bcdbb7e3..3e01b941 100644
--- a/ucw/doc/hash.txt
+++ b/ucw/doc/hash.txt
@@ -13,6 +13,9 @@ There are non-cryptographic hashes as well.
 - <>
 - <>
 
+<>:
+- <>
+
 <>:
 
 - <>
@@ -63,10 +66,22 @@ SHA1 has the same interface, so the same two ways apply.
 
 See also <>.
 
+[[checksum]]
+Checksums
+---------
+
+Their purpose is to detect random data changes, hardware failures
+and the like. They are not meant to withstand deliberate attacks.
+
+The <> is documented in the
+<>.
+
 [[nocrypto]]
 Non-cryptographic hashes
 ------------------------
 
+They are usually used to identify values in hash tables.
+
 All these functions expect their result to be taken modulo the size of
 a hash table. The size should be a prime number (it gives better distribution).
diff --git a/ucw/doc/index.txt b/ucw/doc/index.txt
index ab7dce96..69823cd1 100644
--- a/ucw/doc/index.txt
+++ b/ucw/doc/index.txt
@@ -30,6 +30,7 @@ Modules
 - <>
 - <>
 - <>
+- <>
 
 Other features
 --------------
@@ -69,8 +70,6 @@ Yet undocumented modules
   * `ipaccess.h`
 - Prefetching of memory
   * `prefetch.h`
-- Compression
-  * `lizard.h`
 - Caches
   * `qache.h`
 - Threads
diff --git a/ucw/lizard.h b/ucw/lizard.h
index 547f181d..f4520725 100644
--- a/ucw/lizard.h
+++ b/ucw/lizard.h
@@ -10,9 +10,18 @@
 #ifndef _UCW_LIZARD_H
 #define _UCW_LIZARD_H
 
+/***
+ * [[basic]]
+ * Basic application
+ * -----------------
+ **/
+
+/**
+ * The compression routine needs the input buffer to be 8 characters longer, because it
+ * does not check the input bounds all the time.
+ **/
 #define LIZARD_NEEDS_CHARS 8
-  /* The compression routine needs input buffer 8 characters longer, because it
-   * does not check the input bounds all the time. */
+
 #define LIZARD_MAX_MULTIPLY 23./22
 #define LIZARD_MAX_ADD 4
   /* In the worst case, the compressed file will not be longer than its
@@ -26,22 +35,95 @@
  * total length is 2(header) + 19(string) + 2(link) = 23. */
+
+/**
+ * The compressed data will not be longer than `LIZARD_MAX_LEN(input_length)`.
+ * Note that `LIZARD_MAX_LEN(length) > length` (this is not a flaw of this particular
+ * algorithm; every lossless compression algorithm must have some input for which it
+ * produces a larger output).
+ *
+ * Use this to compute the size of the @out parameter of @lizard_compress().
+ **/
+#define LIZARD_MAX_LEN(LENGTH) ((LENGTH) * LIZARD_MAX_MULTIPLY + LIZARD_MAX_ADD)
+
 /* lizard.c */
+
+/**
+ * Compress the data provided in @in.
+ * The input buffer must be at least `@in_len + <>`
+ * bytes long (the compression algorithm does not check the bounds all the time).
+ *
+ * The output will be stored in @out. The @out buffer must be at least <>
+ * bytes long to be sure the output fits.
+ *
+ * The function returns the number of bytes actually used (the size of the output).
+ *
+ * Use @lizard_decompress() to get the original data back.
+ **/
 int lizard_compress(const byte *in, uns in_len, byte *out);
+
+/**
+ * Decompress data previously compressed by @lizard_compress().
+ * The input is taken from @in and the result is stored in @out.
+ * The size of the output is returned.
+ *
+ * Note that you need to know the maximum possible size of the output in advance
+ * to allocate enough memory.
+ *
+ * See also <>.
+ **/
 int lizard_decompress(const byte *in, byte *out);
 
 /* lizard-safe.c */
-struct lizard_buffer;
-struct lizard_buffer *lizard_alloc(void);
-void lizard_free(struct lizard_buffer *buf);
+/***
+ * [[safe]]
+ * Safe decompression
+ * ------------------
+ *
+ * You can use safe decompression when you want to make sure you got the
+ * length right and when you want to reuse the buffer for the output.
+ ***/
+
+struct lizard_buffer; /** Type of the output buffer for @lizard_decompress_safe(). **/
+
+struct lizard_buffer *lizard_alloc(void); /** Get me a new <>. **/
+void lizard_free(struct lizard_buffer *buf); /** Return the memory used by a <>. **/
+
+/**
+ * Decompress data previously compressed by @lizard_compress().
+ * The input is taken from @in. @buf is used to store the output.
+ * You need to provide the length of the uncompressed data in @expected_length.
+ *
+ * A pointer to the decompressed data is returned.
+ *
+ * If an error occurs, NULL is returned and `errno` is set.
+ * `EINVAL` means the actual length does not match @expected_length.
+ * `EFAULT` means a segfault was encountered while decompressing (probably
+ * @expected_length was way too low).
+ **/
 byte *lizard_decompress_safe(const byte *in, struct lizard_buffer *buf, uns expected_length);
 
 /* adler32.c */
+
+/***
+ * [[adler]]
+ * Adler-32 checksum
+ * -----------------
+ *
+ * This is here because it is commonly used to check data compressed by LiZaRd.
+ * However, it could also belong to <>.
+ ***/
+
+/**
+ * Update the Adler-32 checksum with more data.
+ * @adler is the old value, @ptr points to @len bytes of data to update with.
+ * The result is returned.
+ **/
 uns adler32_update(uns adler, const byte *ptr, uns len);
-static inline uns
-adler32(const byte *buf, uns len)
+/**
+ * Compute the Adler-32 checksum of a block of data.
+ **/
+static inline uns adler32(const byte *buf, uns len)
 {
   return adler32_update(1, buf, len);
 }
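
A minimal usage sketch of the basic interface documented above (not part of the
patch). It assumes the usual libucw environment: the `byte` and `uns` types and
`xmalloc()`/`xfree()` from `ucw/lib.h`. The helper function name is made up for
illustration and error handling is omitted.

#include <string.h>
#include <ucw/lib.h>
#include <ucw/lizard.h>

/* Compress a block of data; returns the compressed length and stores
 * a newly allocated output buffer in *out. */
static uns compress_block(const byte *data, uns len, byte **out)
{
  /* lizard_compress() may read up to LIZARD_NEEDS_CHARS bytes past the end
   * of the input, so copy the data into a padded buffer first. */
  byte *in = xmalloc(len + LIZARD_NEEDS_CHARS);
  memcpy(in, data, len);

  /* LIZARD_MAX_LEN() bounds the worst-case size of the compressed output. */
  *out = xmalloc(LIZARD_MAX_LEN(len));
  uns out_len = lizard_compress(in, len, *out);
  xfree(in);
  return out_len;
}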
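
A second sketch, under the same assumptions, showing the safe decompression
interface together with the Adler-32 checksum; the caller is expected to have
recorded the original length and checksum at compression time, and the helper
name is again hypothetical.

#include <stdio.h>
#include <ucw/lib.h>
#include <ucw/lizard.h>

/* Decompress `compressed` into a reusable lizard_buffer and verify the
 * result against a previously recorded Adler-32 checksum. */
static int check_roundtrip(const byte *compressed, uns orig_len, uns orig_adler)
{
  struct lizard_buffer *buf = lizard_alloc();
  int ok = 0;

  /* The expected uncompressed length must be supplied; on failure NULL is
   * returned and errno is set (EINVAL or EFAULT). */
  byte *data = lizard_decompress_safe(compressed, buf, orig_len);
  if (!data)
    perror("lizard_decompress_safe");
  else if (adler32(data, orig_len) != orig_adler)
    fprintf(stderr, "Adler-32 mismatch: data corrupted\n");
  else
    ok = 1;

  lizard_free(buf);
  return ok;
}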