From: Robert Spalek
Date: Fri, 11 Jun 2004 15:28:29 +0000 (+0000)
Subject: implemented fast compression and decompression routines
X-Git-Tag: holmes-import~1059
X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=d94fc019538bc9c96b694bf0cb1924fbd9251a6b;p=libucw.git

implemented fast compression and decompression routines

I have spent a lot of time on them and they just compile.  I have never
tested them.  The test program will follow soon.
---
diff --git a/lib/lizard.c b/lib/lizard.c
new file mode 100644
index 00000000..c71fc9ac
--- /dev/null
+++ b/lib/lizard.c
@@ -0,0 +1,491 @@
+/*
+ * LiZzaRd -- Fast compression method based on Lempel-Ziv 77
+ *
+ * (c) 2004, Robert Spalek
+ *
+ * The file format is based on LZO1X and
+ * the compression method is based on zlib.
+ */
+
+#include "lib/lib.h"
+#include "lib/lizard.h"
+
+#include <string.h>
+
+typedef u16 hash_ptr_t;
+struct hash_record {
+  byte *string;                 // in the original text
+  hash_ptr_t next4, next3;      // 0=end of the list
+};
+
+#define HASH_SIZE ((1<<14) + 27)        // prime, size of the hash-table
+#define HASH_RECORDS ((1<<15) - 1)      // maximum number of records in the hash-table
+#define CHAIN_MAX_TESTS 16              // crop longer collision chains
+#define CHAIN_GOOD_MATCH 16             // we already have a good match => end
+
+static inline uns
+hashf(byte *string)
+{
+  uns hash;
+#ifdef CPU_ALLOW_UNALIGNED
+  hash = * (uns *) string;
+#elif CPU_LITTLE_ENDIAN
+  hash = string[0] | (string[1]<<8) | (string[2]<<16) | (string[3]<<24);
+#else
+  hash = string[3] | (string[2]<<8) | (string[1]<<16) | (string[0]<<24);
+#endif
+  return hash;
+}
+#ifdef CPU_LITTLE_ENDIAN
+#define MASK_4TH_CHAR (~(0xff << 24))
+#else
+#define MASK_4TH_CHAR (~0xff)
+#endif
+
+static inline uns
+find_match(uns list_id, hash_ptr_t *hash_tab, struct hash_record *hash_rec, uns hash, byte *string, byte *string_end, byte **best_ptr)
+  /* We heavily rely on gcc optimisations (e.g. list_id is a constant, hence
+   * some branches of the code will be pruned).
+   *
+   * hash_tab[hash] points to the 1st record of the linked list of strings
+   * with the same hash.  The records are statically stored in the circular
+   * array hash_rec (with the 1st entry unused), and the pointers are just
+   * 16-bit indices.  In one array, we store 2 independent linked lists: for
+   * 3 and for 4 hashed characters.
+   *
+   * Updates are performed more often than searches, so they are lazy: we only
+   * add new records to the front of the lists and never delete them.  Because
+   * we reuse already allocated space in the circular array, we necessarily
+   * modify records that are still pointed at from the middle of some list.
+   * To prevent confusion, this function checks 2 invariants: the strings in a
+   * collision chain are ordered, and the hash of the 1st record in the chain
+   * must match.  */
+{
+  if (list_id == 3)             /* Find the 1st record in the collision chain.  */
+    hash = hash & MASK_4TH_CHAR;
+  hash %= HASH_SIZE;
+  if (!hash_tab[hash])
+    return 0;
+  struct hash_record *record = hash_rec + hash_tab[hash];
+  uns hash2 = hashf(record->string);
+  if (list_id == 3)
+    hash2 = hash2 & MASK_4TH_CHAR;
+  hash2 %= HASH_SIZE;
+  if (hash2 != hash)            /* Verify the hash-value.  */
+    {
+      hash_tab[hash] = 0;       /* prune */
+      return 0;
+    }
+
+  uns count = CHAIN_MAX_TESTS;
+  uns best_len = 0;
+  byte *last_string = string_end;
+  hash_ptr_t *last_ptr = NULL;
+  while (count-- > 0)
+    {
+      byte *cmp = record->string;
+      if (cmp >= last_string)   /* the collision chain is ordered by age */
+        {
+          *last_ptr = 0;        /* prune */
+          break;
+        }
+      last_string = cmp;
+      if (cmp[0] == string[0] && cmp[2] == string[2])
+        {
+          if (cmp[3] == string[3])
+            {
+              /* implies cmp[1] == string[1], because we hash into 14+eps bits */
+              cmp += 4;
+              if (*cmp++ == string[4] && *cmp++ == string[5]
+                  && *cmp++ == string[6] && *cmp++ == string[7])
+                {
+                  byte *str = string + 8;
+                  while (str <= string_end && *cmp++ == *str++);
+                }
+            }
+          else if (list_id == 3)
+            /* hashing 3 characters implies cmp[1] == string[1],
+             * hashing 4 characters implies cmp[1] != string[1] */
+            cmp += 4;
+          else
+            goto next_collision;
+          uns len = cmp - record->string - 1;   /* cmp points 2 characters after the last match */
+          if (len > best_len)
+            {
+              best_len = len;
+              *best_ptr = record->string;
+              if (best_len >= CHAIN_GOOD_MATCH) /* optimisation */
+                break;
+            }
+        }
+next_collision:
+      if (list_id == 3)
+        {
+          if (!record->next3)
+            break;
+          last_ptr = &record->next3;
+          record = hash_rec + record->next3;
+        }
+      else
+        {
+          if (!record->next4)
+            break;
+          last_ptr = &record->next4;
+          record = hash_rec + record->next4;
+        }
+    }
+  return best_len;
+}
+
+static inline uns
+hash_string(byte *string, uns hash, hash_ptr_t *tab3, hash_ptr_t *tab4, struct hash_record *hash_rec, uns head)
+{
+  uns h3 = (hash & MASK_4TH_CHAR) % HASH_SIZE;
+  uns h4 = hash % HASH_SIZE;
+  struct hash_record *rec = hash_rec + head;
+  rec->string = string;
+  rec->next3 = tab3[h3];
+  rec->next4 = tab4[h4];
+  tab3[h3] = head;              /* add the new record to the front of both lists */
+  tab4[h4] = head;
+  head++;                       /* circular buffer, reuse old records */
+  if (head >= HASH_RECORDS)
+    head = 1;                   /* entry 0 stays unused, 0 marks the end of a list */
+  return head;
+}
+
+static inline byte *
+dump_unary_value(byte *out, uns l)
+{
+  while (l > 255)
+    {
+      l -= 255;
+      *out++ = 0;
+    }
+  *out++ = l;
+  return out;
+}
+
+static byte *
+flush_copy_command(uns bof, byte *out, byte *start, uns len)
+{
+  if (bof && len <= 238)
+    *out++ = len + 17;
+  else if (len < 4)
+    /* cannot happen when !!bof */
+    out[-2] |= len;             /* invariant: the lowest 2 bits 2 bytes back are free */
+  else
+    {
+      /* leave the 2 least significant bits of out[-2] set to 0 */
+      if (len <= 18)
+        *out++ = len - 3;
+      else
+        {
+          *out++ = 0;
+          out = dump_unary_value(out, len - 18);
+        }
+    }
+  while (len-- > 0)
+    *out++ = *start++;
+  return out;
+}
+
+int
+lizzard_compress(byte *in, uns in_len, byte *out)
+  /* Requires out to be allocated for at least in_len * LIZZARD_MAX_PROLONG_FACTOR.
+   * There must be at least 8 readable characters allocated after in.
+   * Returns the actual compressed length.  */
+{
+  hash_ptr_t hash_tab3[HASH_SIZE], hash_tab4[HASH_SIZE];
+  struct hash_record hash_rec[HASH_RECORDS];
+  byte *in_start = in;
+  byte *in_end = in + in_len;
+  byte *out_start = out;
+  byte *copy_start = in;
+  uns head = 1;                 /* entry 0 of hash_rec is unused */
+  bzero(hash_tab3, sizeof(hash_tab3));  /* init the hash-tables */
+  bzero(hash_tab4, sizeof(hash_tab4));
+  while (in < in_end)
+    {
+      uns hash = hashf(in);
+      byte *best;
+      uns len = find_match(4, hash_tab4, hash_rec, hash, in, in_end, &best);
+      if (len >= 3)
+        goto match;
+      len = find_match(3, hash_tab3, hash_rec, hash, in, in_end, &best);
+
+match_found:
+      if (len >= 3)
+        goto match;
+#if 0   // TODO: now, our routine does not detect matches of length 2
+      if (len == 2 && (in - best) < ((1<<10) + 1))
+        goto match;
+#endif
+literal:
+      head = hash_string(in, hash, hash_tab3, hash_tab4, hash_rec, head);
+      in++;                     /* add a literal */
+      continue;
+
+match:
+      if (in + len > in_end)    /* crop at EOF */
+        {
+          len = in_end - in;
+          if (len < 3)
+            goto match_found;
+        }
+      /* Record the match.  */
+      uns copy_len = in - copy_start;
+      uns is_in_copy_mode = copy_start==in_start || copy_len >= 4;
+      uns shift = in - best - 1;
+      /* Try to use a 2-byte sequence.  */
+      if (len == 2)
+        {
+          if (is_in_copy_mode || !copy_len)
+            /* cannot be used with 0 copied characters, because this bit
+             * pattern is reserved for the copy mode */
+            goto literal;
+dump_2sequence:
+          if (copy_len > 0)
+            out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
+          *out++ = (shift>>6) & ~3;     /* shift fits into 10 bits */
+          *out++ = shift & 0xff;
+        }
+      else if (len == 3 && is_in_copy_mode && shift >= (1<<11) && shift < (1<<11) + (1<<10))
+        {
+          /* optimisation for length-3 matches after a copy command */
+          shift -= 1<<11;
+          goto dump_2sequence;
+        }
+      /* now, len >= 3 */
+      else if (shift < (1<<11) && len <= 8)
+        {
+          shift |= (len-3 + 2)<<11;
+          goto dump_2sequence;  /* shift has 11 bits and also contains len */
+        }
+      /* We have to use a 3-byte sequence.  */
+      else if (len == 3 && is_in_copy_mode)
+        /* avoid encoding a 3-character match by a 3-byte sequence when it can
+         * simply be appended to the pending copy command */
+        goto literal;
+      else
+        {
+          if (copy_len > 0)
+            out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
+          if (shift < (1<<14))
+            {
+              if (len <= 33)
+                *out++ = (1<<5) | (len-2);
+              else
+                {
+                  *out++ = 1<<5;
+                  out = dump_unary_value(out, len - 33);
+                }
+            }
+          else                  /* shift < (1<<15)-1 because of HASH_RECORDS */
+            {
+              shift++;          /* because shift==0 is reserved for EOF */
+              byte pos_bit = (shift>>11) & (1<<3);
+              if (len <= 9)
+                *out++ = (1<<4) | pos_bit | (len-2);
+              else
+                {
+                  *out++ = (1<<4) | pos_bit;
+                  out = dump_unary_value(out, len - 9);
+                }
+            }
+          *out++ = (shift>>6) & ~3;     /* the rest of shift fits into 14 bits */
+          *out++ = shift & 0xff;
+        }
+      /* Update the hash-table.  */
+      head = hash_string(in, hash, hash_tab3, hash_tab4, hash_rec, head);
+      for (uns i=1; i<len; i++)
+        head = hash_string(in+i, hashf(in+i), hash_tab3, hash_tab4, hash_rec, head);
+      in += len;
+      copy_start = in;
+    }
+  if (in > in_end)              /* crop at EOF */
+    in = in_end;
+  uns copy_len = in - copy_start;
+  if (copy_len > 0)
+    out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
+  *out++ = 17;                  /* add EOF */
+  *out++ = 0;
+  *out++ = 0;
+  return out - out_start;
+}
+
+static inline byte *
+read_unary_value(byte *in, uns *val)
+{
+  uns l = 0;
+  while (!*in++)
+    l += 255;
+  l += in[-1];
+  *val = l;
+  return in;
+}
+
+int
+lizzard_decompress(byte *in, byte *out)
+  /* Requires out to be allocated for the decompressed length; the length must
+   * be known beforehand.  It is desirable to lock the following memory page
+   * for read-only access to prevent buffer overflow.
+   * Returns the actual decompressed length or a negative number when an
+   * error has occurred.  */
+{
+  byte *out_start = out;
+  uns expect_copy_command = 1;
+  uns len;
+  if (*in > 17)                 /* short copy command at BOF */
+    {
+      len = *in++ - 17;
+      expect_copy_command = 2;
+      goto perform_copy_command;
+    }
+  while (1)
+    {
+      uns c = *in++;
+      uns pos;
+      if (c < 0x10)
+        {
+          if (expect_copy_command == 1)
+            {
+              if (!c)
+                {
+                  in = read_unary_value(in, &len);
+                  len += 18;
+                }
+              else
+                len = c + 3;
+              expect_copy_command = 2;
+              goto perform_copy_command;
+            }
+          else
+            {
+              pos = (c&0xc)<<6 | *in++;
+              if (expect_copy_command == 2)
+                {
+                  pos += 1<<11;
+                  len = 3;
+                }
+              else
+                len = 2;
+            }
+        }
+      else if (c < 0x20)
+        {
+          pos = (c&0x8)<<11;
+          len = c&0x7;
+          if (!len)
+            {
+              in = read_unary_value(in, &len);
+              len += 9;
+            }
+          else
+            len += 2;
+          pos |= (*in++ & 0xfc)<<6;
+          pos |= *in++;
+          if (!pos)             /* EOF */
+            break;
+          else                  /* shift it back */
+            pos--;
+        }
+      else if (c < 0x40)
+        {
+          len = c&0x1f;
+          if (!len)
+            {
+              in = read_unary_value(in, &len);
+              len += 33;
+            }
+          else
+            len += 2;
+          pos = (*in++ & 0xfc)<<6;
+          pos |= *in++;
+        }
+      else                      /* the high bits encode the length */
+        {
+          len = (c>>5) - 2 + 3;
+          pos = (c&0x1c)<<6;
+          pos |= *in++;
+        }
+      /* take the string from the sliding window of the output */
+      memcpy(out, out-1-pos, len);      //FIXME: the regions may overlap
+      out += len;
+      /* extract the copy-bits */
+      len = in[-2] & 0x3;
+      if (len)
+        expect_copy_command = 0;        /* and fall-thru */
+      else
+        {
+          expect_copy_command = 1;
+          continue;
+        }
+
+perform_copy_command:
+      memcpy(out, in, len);
+      out += len;
+      in += len;
+    }
+
+  return out - out_start;
+}
+
+/*
+
+Description of the LZO1X format:
+================================
+
+Beginning of file:
+------------------
+
+If the first byte is 00010001, it probably means EOF (an empty file), so
+switch to the compressed mode.  If it is bigger, subtract 17, copy that many
+of the following characters to the output, and switch to the compressed mode.
+If it is smaller, go to the copy mode.
+
+Compressed mode:
+----------------
+
+Read the first byte of the sequence and determine the type of bit-encoding by
+looking at the most significant bits.  The sequence is always at least 2 bytes
+long.  Decode sequences of these types until the EOF or END marker is read.
+
+  length L = length of the text taken from the sliding window
+
+    If L=0, count the number Z of the following zero bytes and add Z*255 to
+    the value of the following non-zero byte.  This allows setting L
+    arbitrarily high.
+
+  position p = relative position of the beginning of the text
+
+    Exception: 00010001 00000000 00000000 means EOF
+
+  copying C = length 1..3 of copied characters, or END=0
+
+    C following characters will be copied from the compressed text to the
+    output.  The number C is always stored in the 2 least significant bits of
+    the second last byte of the sequence.
+
+    If END is read, the algorithm switches to the copy mode.
+
+pattern                          length          position
+
+0000ppCC pppppppp                2               10 bits  (*)
+0001pLLL L* ppppppCC pppppppp    3..9 + extend   15 bits  + EOF
+001LLLLL L* ppppppCC pppppppp    3..33 + extend  14 bits
+01\
+10 \
+11  \
+LLLpppCC pppppppp                3..8            11 bits
+
+Copy mode:
+----------
+
+Read the first byte and, if the most significant bits are 0000, perform the
+following command; otherwise switch to the compressed mode (and evaluate the
+command).
+
+pattern                          length          position
+
+0000LLLL L*                      4..18 + extend  N/A
+
+  Copy L characters from the compressed text to the output.  The overhead for
+  incompressible strings is only roughly 1/256 + epsilon.
+
+(*) After reading one copy command, switch to the compressed mode with the
+following "optimisation": the pattern 0000ppCC expands to length 3 instead of
+2 and 2048 is added to the position (now it is slightly more than 11 bits),
+because a sequence of length 2 would never be used.
+
+*/
diff --git a/lib/lizard.h b/lib/lizard.h
new file mode 100644
index 00000000..656aab8d
--- /dev/null
+++ b/lib/lizard.h
@@ -0,0 +1,16 @@
+/*
+ * LiZzaRd -- Fast compression method based on Lempel-Ziv 77
+ *
+ * (c) 2004, Robert Spalek
+ *
+ * The file format is based on LZO1X and
+ * the compression method is based on zlib.
+ */
+
+#define LIZZARD_MAX_PROLONG_FACTOR 129/128
+  /* An incompressible string of length 256 has a 2-byte header.  Hence the
+   * file length gets multiplied by at most 129/128, plus at most 4 bytes are
+   * added for the header and the EOF marker.  */
+
+int lizzard_compress(byte *in, uns in_len, byte *out);
+int lizzard_decompress(byte *in, byte *out);
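
The test program is not part of this commit yet; below is only a minimal
round-trip sketch of the intended calling sequence.  It assumes the xmalloc()
and die() helpers from lib/lib.h and a caller that remembers the original
length, because lizzard_decompress() needs its output buffer pre-allocated to
the known decompressed size.  The function names and buffer sizes come from
lib/lizard.h above; everything else (try_lizzard and its arguments) is purely
illustrative:

/* Hypothetical round-trip check, not the promised test program. */
#include "lib/lib.h"
#include "lib/lizard.h"

#include <string.h>

static void
try_lizzard(byte *data, uns len)
{
  byte *in = xmalloc(len + 8);          /* lizzard_compress() may read up to 8 bytes past the end */
  memcpy(in, data, len);
  byte *comp = xmalloc(len * LIZZARD_MAX_PROLONG_FACTOR + 4);   /* worst-case growth + header and EOF */
  int comp_len = lizzard_compress(in, len, comp);
  byte *out = xmalloc(len);             /* the decompressed length is known beforehand */
  int out_len = lizzard_decompress(comp, out);
  if (out_len != (int) len || memcmp(out, data, len))
    die("lizzard: round-trip failed (%u bytes in, %d compressed, %d back)", len, comp_len, out_len);
}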