ucw docs: The lizard compression algorithm

author Michal Vaner <vorner@ucw.cz>

Thu, 27 Nov 2008 16:19:17 +0000 (17:19 +0100)

committer Michal Vaner <vorner@ucw.cz>

Thu, 27 Nov 2008 16:19:17 +0000 (17:19 +0100)
author Michal Vaner <vorner@ucw.cz>
Thu, 27 Nov 2008 16:19:17 +0000 (17:19 +0100)
committer Michal Vaner <vorner@ucw.cz>
Thu, 27 Nov 2008 16:19:17 +0000 (17:19 +0100)
diff --git a/ucw/doc/Makefile b/ucw/doc/Makefile

index c14ff1ec90b4c4b2959c19c225fd15000dfbdefd..ecb22d5c046006dec925acf7056c4a94a839452c 100644 (file)
--- a/ucw/doc/Makefile
+++ b/ucw/doc/Makefile
@@ -2,7 +2,7 @@
  
  DIRS+=ucw/doc
  
-UCW_DOCS=fastbuf index config configure install basecode hash docsys conf mempool eltpool mainloop generic growbuf unaligned lists chartype unicode prime binsearch heap binheap
+UCW_DOCS=fastbuf index config configure install basecode hash docsys conf mempool eltpool mainloop generic growbuf unaligned lists chartype unicode prime binsearch heap binheap compress
  UCW_INDEX=$(o)/ucw/doc/def_index.html
  UCW_DOCS_HTML=$(addprefix $(o)/ucw/doc/,$(addsuffix .html,$(UCW_DOCS)))
  
diff --git a/ucw/doc/compress.txt b/ucw/doc/compress.txt

new file mode 100644 (file)

index 0000000..7e95099
--- /dev/null
+++ b/ucw/doc/compress.txt
@@ -0,0 +1,15 @@
+Compression
+===========
+
+The library contains a compression routine called LiZaRd.  It is a
+modified Lempel-Ziv 77 method with a slightly worse compression ratio,
+but with faster compression and decompression.
+
+// TODO Meaning of the abbreviation
+// TODO Actual numbers how fast it is?
+
+- <<basic,Basic application>>
+- <<safe,Safe decompression>>
+- <<adler,Adler-32 checksum>>
+
+!!ucw/lizard.h
diff --git a/ucw/doc/hash.txt b/ucw/doc/hash.txt

index bcdbb7e389ba19c6bddaa533a2e496ff69eebac3..3e01b94156c23de6d9a79f7732f7992954787a89 100644 (file)
--- a/ucw/doc/hash.txt
+++ b/ucw/doc/hash.txt
@@ -13,6 +13,9 @@ There are non-cryptographic hashes as well.
  - <<sha1:sha1_hmac(),SHA1_HMAC>>
  - <<usage,Common usage>>
  
+<<checksum,Checksums>>:
+- <<compress:adler,Adler-32>>
+
  <<nocrypto,Non-cryptographic ones>>:
  
  - <<strhash,String & block hashes>>
@@ -63,10 +66,22 @@ SHA1 has the same interface, so the same two ways apply.
  
  See also <<string:mem_to_hex()>>.
  
+[[checksum]]
+Checksums
+---------
+
+Their purpose is to detect random data changes, hardware failures
+and the like. They are not meant to protect against deliberate attacks.
+
+The <<compress:adler,Adler-32 checksum>> is documented in the
+<<compress:,compression chapter>>.
+
  [[nocrypto]]
  Non-cryptographic hashes
  ------------------------
  
+They are usually used to identify values in hash tables.
+
  The results of all these functions are expected to be taken modulo the size of a hash table.
  The size should be a prime number (it gives better distribution).
  
diff --git a/ucw/doc/index.txt b/ucw/doc/index.txt

index ab7dce96c81cc72a8c071bb796398617ddabe763..69823cd15ddf4339a31f266e3a2a52b7777820e2 100644 (file)
--- a/ucw/doc/index.txt
+++ b/ucw/doc/index.txt
@@ -30,6 +30,7 @@ Modules
  - <<binsearch:,Binary search>>
  - <<heap:,Binary heaps>>
  - <<binheap:,Binomial heaps>>
+- <<compress:,Compression>>
  
  Other features
  --------------
@@ -69,8 +70,6 @@ Yet undocumented modules
    * `ipaccess.h`
  - Prefetching of memory
    * `prefetch.h`
-- Compression
-  * `lizard.h`
  - Caches
    * `qache.h`
  - Threads
diff --git a/ucw/lizard.h b/ucw/lizard.h

index 547f181dc2af3fc813f7d1ebdf3bb8b3109b39aa..f45207251c9d73feb4e7af3bcb02fc50962ee6d6 100644 (file)
--- a/ucw/lizard.h
+++ b/ucw/lizard.h
@@ -10,9 +10,18 @@
  #ifndef _UCW_LIZARD_H
  #define _UCW_LIZARD_H
  
+/***
+ * [[basic]]
+ * Basic application
+ * -----------------
+ **/
+
+/**
+ * The compression routine needs input buffer 8 characters longer, because it
+ * does not check the input bounds all the time.
+ **/
  #define        LIZARD_NEEDS_CHARS      8
-  /* The compression routine needs input buffer 8 characters longer, because it
-   * does not check the input bounds all the time.  */
+
  #define        LIZARD_MAX_MULTIPLY     23./22
  #define        LIZARD_MAX_ADD          4
    /* In the worst case, the compressed file will not be longer than its
@@ -26,22 +35,95 @@
     * total length is 2(header) + 19(string) + 2(link) = 23.
     */
  
+/**
+ * The compressed data will not be longer than `LIZARD_MAX_LEN(input_length)`.
+ * Note that `LIZARD_MAX_LEN(length) > length` (this is not a problem of the algorithm,
+ * every lossless compression algorithm must have an input for which it produces a larger
+ * output).
+ *
+ * Use this to compute the size of the @out parameter of @lizard_compress().
+ **/
+#define LIZARD_MAX_LEN(LENGTH) ((LENGTH) * LIZARD_MAX_MULTIPLY + LIZARD_MAX_ADD)
+
  /* lizard.c */
+
+/**
+ * Compress data provided in @in.
+ * The input buffer must be at least `@in_len + <<def_LIZARD_NEEDS_CHARS,LIZARD_NEEDS_CHARS>>`
+ * long (the compression algorithm does not check the bounds all the time).
+ *
+ * The output will be stored in @out. The @out buffer must be at least <<def_LIZARD_MAX_LEN,`LIZARD_MAX_LEN(@in_len)`>>
+ * bytes long for the output to fit in for sure.
+ *
+ * The function returns number of bytes actually needed (the size of output).
+ *
+ * Use @lizard_decompress() to get the original data.
+ **/
  int lizard_compress(const byte *in, uns in_len, byte *out);
+
+/**
+ * Decompress data previously compressed by @lizard_compress().
+ * Input is taken from @in and the result stored in @out.
+ * The size of output is returned.
+ *
+ * Note that you need to know the maximal possible size of the output to
+ * allocate enough memory.
+ *
+ * See also <<safe,safe decompression>>.
+ **/
  int lizard_decompress(const byte *in, byte *out);
  
  /* lizard-safe.c */
-struct lizard_buffer;
  
-struct lizard_buffer *lizard_alloc(void);
-void lizard_free(struct lizard_buffer *buf);
+/***
+ * [[safe]]
+ * Safe decompression
+ * ------------------
+ *
+ * You can use safe decompression, when you want to make sure you got the
+ * length right and when you want to reuse the buffer for output.
+ ***/
+
+struct lizard_buffer;  /** Type of the output buffer for @lizard_decompress_safe(). **/
+
+struct lizard_buffer *lizard_alloc(void);      /** Get me a new <<struct_lizard_buffer,`lizard_buffer`>>. **/
+void lizard_free(struct lizard_buffer *buf);   /** Return memory used by a <<struct_lizard_buffer,`lizard_buffer`>>. **/
+
+/**
+ * Decompress data previously compressed by @lizard_compress().
+ * Input is taken from @in. @buf is used to store the output.
+ * You need to provide the length of the uncompressed data in @expected_length.
+ *
+ * The pointer to data is returned.
+ *
+ * If an error occurs, NULL is returned and `errno` is set.
+ * `EINVAL` means the actual length does not match @expected_length.
+ * `EFAULT` means a segfault was encountered while decompressing (probably @expected_length was way too low).
+ **/
  byte *lizard_decompress_safe(const byte *in, struct lizard_buffer *buf, uns expected_length);
  
  /* adler32.c */
+
+/***
+ * [[adler]]
+ * Adler-32 checksum
+ * -----------------
+ *
+ * This is here because it is commonly used to check data compressed by LiZaRd.
+ * However, it could also belong to <<hash,hashing routines>>.
+ ***/
+
+/**
+ * Update the Adler-32 checksum with more data.
+ * @adler is the old value, @ptr points to @len bytes of data to update with.
+ * Result is returned.
+ **/
  uns adler32_update(uns adler, const byte *ptr, uns len);
  
-static inline uns
-adler32(const byte *buf, uns len)
+/**
+ * Compute the Adler-32 checksum of a block of data.
+ **/
+static inline uns adler32(const byte *buf, uns len)
  {
    return adler32_update(1, buf, len);
  }
author	Michal Vaner <vorner@ucw.cz>
	Thu, 27 Nov 2008 16:19:17 +0000 (17:19 +0100)
committer	Michal Vaner <vorner@ucw.cz>
	Thu, 27 Nov 2008 16:19:17 +0000 (17:19 +0100)
ucw/doc/Makefile		patch \| blob \| history
ucw/doc/compress.txt	[new file with mode: 0644]	patch \| blob
ucw/doc/hash.txt		patch \| blob \| history
ucw/doc/index.txt		patch \| blob \| history
ucw/lizard.h		patch \| blob \| history