ucw/lizard.c

   1 /*
   2  *      LiZaRd -- Fast compression method based on Lempel-Ziv 77
   3  *
   4  *      (c) 2004, Robert Spalek <robert@ucw.cz>
   5  *
   6  *      This software may be freely distributed and used according to the terms
   7  *      of the GNU Lesser General Public License.
   8  *
   9  *      The file format is based on LZO1X and
  10  *      the compression method is based on zlib.
  11  */
  12
  13 #include <ucw/lib.h>
  14 #include <ucw/lizard.h>
  15
  16 #include <string.h>
  17
  18 typedef u16 hash_ptr_t;
  19 struct hash_record {
  20   /* the position in the original text is implicit; it is computed by locate_string() */
  21   hash_ptr_t next;                      // 0=end
  22   hash_ptr_t prev;                      // high bit: 0=record in array, 1=head in hash-table (i.e. value of hashf)
  23 };
  24
  25 #define HASH_SIZE       (1<<14)         // size of hash-table
  26 #define HASH_RECORDS    (1<<15)         // maximum number of records in hash-table, 0 is unused ==> subtract 1
  27 #define CHAIN_MAX_TESTS         8       // crop longer collision chains
  28 #define CHAIN_GOOD_MATCH        32      // we already have a good match => end
  29
  30 static inline uint
  31 hashf(const byte *string)
  32   /* 0..HASH_SIZE-1 */
  33 {
  34     return string[0] ^ (string[1]<<3) ^ (string[2]<<6);
  35 }
  36
  37 static inline byte *
  38 locate_string(const byte *string, int record_id, int head)
  39   /* The strings are recorded into the hash-table regularly, hence there is no
  40    * need to store the pointer there.  */
  41 {
  42   string += record_id - head;
  43   if (record_id >= head)
  44     string -= HASH_RECORDS-1;
  45   return (byte *)string;
  46 }
  47
  48 static inline uint
  49 find_match(uint record_id, struct hash_record *hash_rec, const byte *string, const byte *string_end, byte **best_ptr, uint head)
  50   /* hash_tab[hash] == record_id points to the head of the double-linked
  51    * link-list of strings with the same hash.  The records are statically
  52    * stored in circular array hash_rec (with the 1st entry unused), and the
  53    * pointers are just 16-bit indices.  The strings in every collision chain
  54    * are ordered by age.  */
  55 {
  56   uint count = CHAIN_MAX_TESTS;
  57   uint best_len = 0;
  58   while (record_id && count-- > 0)
  59   {
  60     byte *record_string = locate_string(string, record_id, head);
  61     byte *cmp = record_string;
  62     if (cmp[0] == string[0] && cmp[2] == string[2])
  63     /* implies cmp[1] == string[1] */
  64     {
  65       if (cmp[3] == string[3])
  66       {
  67         cmp += 4;
  68         if (*cmp++ == string[4] && *cmp++ == string[5]
  69             && *cmp++ == string[6] && *cmp++ == string[7])
  70         {
  71           const byte *str = string + 8;
  72           while (str <= string_end && *cmp++ == *str++);
  73         }
  74       }
  75       else
  76         cmp += 4;
  77       uint len = cmp - record_string - 1;       /* cmp points 2 characters after the last match */
  78       if (len > best_len)
  79       {
  80         best_len = len;
  81         *best_ptr = record_string;
  82         if (best_len >= CHAIN_GOOD_MATCH)       /* optimization */
  83           break;
  84       }
  85     }
  86     record_id = hash_rec[record_id].next;
  87   }
  88   return best_len;
  89 }
  90
  91 static uint
  92 hash_string(hash_ptr_t *hash_tab, uint hash, struct hash_record *hash_rec, /*byte *string,*/ uint head, uint *to_delete)
  93   /* We reuse hash-records stored in a circular array.  First, delete the old
  94    * one and then add the new one in front of the link-list.  */
  95 {
  96   struct hash_record *rec = hash_rec + head;
  97   if (*to_delete)                               /* unlink the original record */
  98   {
  99     uint prev_id = rec->prev & ((1<<15)-1);
 100     if (rec->prev & (1<<15))                    /* was a head */
 101       hash_tab[prev_id] = 0;
 102     else                                        /* thanks to the ordering, this was a tail */
 103       hash_rec[prev_id].next = 0;
 104   }
 105   rec->next = hash_tab[hash];
 106   rec->prev = (1<<15) | hash;
 107   hash_rec[rec->next].prev = head;
 108   hash_tab[hash] = head;                        /* add the new record before the link-list */
 109
 110   if (++head >= HASH_RECORDS)                   /* circular buffer, reuse old records, 0 is unused */
 111   {
 112     head = 1;
 113     *to_delete = 1;
 114   }
 115   return head;
 116 }
 117
 118 static inline byte *
 119 dump_unary_value(byte *out, uint l)
 120 {
 121   while (l > 255)
 122   {
 123     l -= 255;
 124     *out++ = 0;
 125   }
 126   *out++ = l;
 127   return out;
 128 }
 129
 130 static byte *
 131 flush_copy_command(uint bof, byte *out, const byte *start, uint len)
 132 {
 133   if (bof && len <= 238)
 134     *out++ = len + 17;
 135   else if (len < 4)
 136   {
 137     /* cannot happen when !!bof */
 138     out[-2] |= len;                             /* invariant: lowest 2 bits 2 bytes back */
 139 #ifdef  CPU_ALLOW_UNALIGNED
 140     * (u32*) out = * (u32*) start;
 141     return out + len;
 142 #else
 143     while (len-- > 0)
 144       *out++ = *start++;
 145     return out;
 146 #endif
 147   }
 148   else
 149   {
 150     /* leave 2 least significant bits of out[-2] set to 0 */
 151     if (len <= 18)
 152       *out++ = len - 3;
 153     else
 154     {
 155       *out++ = 0;
 156       out = dump_unary_value(out, len - 18);
 157     }
 158   }
 159   memcpy(out, start, len);
 160   return out + len;
 161 }
 162
 163 int
 164 lizard_compress(const byte *in, uint in_len, byte *out)
 165   /* Requires out being allocated for at least in_len * LIZARD_MAX_MULTIPLY +
 166    * LIZARD_MAX_ADD.  There must be at least LIZARD_NEEDS_CHARS characters
 167    * allocated after in.  Returns the actual compressed length. */
 168 {
 169   hash_ptr_t hash_tab[HASH_SIZE];
 170   struct hash_record hash_rec[HASH_RECORDS];
 171   const byte *in_end = in + in_len;
 172   byte *out_start = out;
 173   const byte *copy_start = in;
 174   uint head = 1;                                        /* 0 in unused */
 175   uint to_delete = 0, bof = 1;
 176   bzero(hash_tab, sizeof(hash_tab));            /* init the hash-table */
 177   while (in < in_end)
 178   {
 179     uint hash = hashf(in);
 180     byte *best = NULL;
 181     uint len = find_match(hash_tab[hash], hash_rec, in, in_end, &best, head);
 182     if (len < 3)
 183 #if 0                   // TODO: now, our routine does not detect matches of length 2
 184       if (len == 2 && (in - best->string - 1) < (1<<10))
 185       { /* pass-thru */ }
 186       else
 187 #endif
 188       {
 189 literal:
 190         head = hash_string(hash_tab, hash, hash_rec, head, &to_delete);
 191         in++;                                   /* add a literal */
 192         continue;
 193       }
 194
 195     if (in + len > in_end)                      /* crop EOF */
 196     {
 197       len = in_end - in;
 198       if (len < 3)
 199         goto literal;
 200     }
 201     /* Record the match.  */
 202     uint copy_len = in - copy_start;
 203     uint is_in_copy_mode = bof || copy_len >= 4;
 204     uint shift = in - best - 1;
 205     /* Try to use a 2-byte sequence.  */
 206 #if 0
 207     if (len == 2)
 208     {
 209       if (is_in_copy_mode || !copy_len)         /* cannot use with 0 copied characters, because this bit pattern is reserved for copy mode */
 210         goto literal;
 211       else
 212         goto dump_2sequence;
 213     } else
 214 #endif
 215     /* now, len >= 3 */
 216     if (shift < (1<<11) && len <= 8)
 217     {
 218       shift |= (len-3 + 2)<<11;
 219 dump_2sequence:
 220       if (copy_len)
 221         out = flush_copy_command(bof, out, copy_start, copy_len);
 222       *out++ = (shift>>6) & ~3;                 /* shift fits into 10 bits */
 223       *out++ = shift & 0xff;
 224     }
 225     else if (len == 3 && is_in_copy_mode)
 226     {
 227       if (shift < (1<<11) + (1<<10))            /* optimisation for length-3 matches after a copy command */
 228       {
 229         shift -= 1<<11;
 230         goto dump_2sequence;                    /* shift has 11 bits and contains also len */
 231       }
 232       else                                      /* avoid 3-sequence compressed to 3 sequence if it can simply be appended */
 233         goto literal;
 234     }
 235     /* We have to use a 3-byte sequence.  */
 236     else
 237     {
 238       if (copy_len)
 239         out = flush_copy_command(bof, out, copy_start, copy_len);
 240       if (shift < (1<<14))
 241       {
 242         if (len <= 33)
 243           *out++ = (1<<5) | (len-2);
 244         else
 245         {
 246           *out++ = 1<<5;
 247           out = dump_unary_value(out, len - 33);
 248         }
 249       }
 250       else /* shift < (1<<15)-1 becase of HASH_RECORDS */
 251       {
 252         shift++;                                /* because shift==0 is reserved for EOF */
 253         byte pos_bit = ((shift>>11) & (1<<3)) | (1<<4);
 254         if (len <= 9)
 255           *out++ = pos_bit | (len-2);
 256         else
 257         {
 258           *out++ = pos_bit;
 259           out = dump_unary_value(out, len - 9);
 260         }
 261       }
 262       *out++ = (shift>>6) & ~3;                 /* rest of shift fits into 14 bits */
 263       *out++ = shift & 0xff;
 264     }
 265     /* Update the hash-table.  */
 266     head = hash_string(hash_tab, hash, hash_rec, head, &to_delete);
 267     for (uint i=1; i<len; i++)
 268       head = hash_string(hash_tab, hashf(in+i), hash_rec, head, &to_delete);
 269     in += len;
 270     copy_start = in;
 271     bof = 0;
 272   }
 273   uint copy_len = in - copy_start;
 274   if (copy_len)
 275     out = flush_copy_command(bof, out, copy_start, copy_len);
 276   *out++ = 17;                                  /* add EOF */
 277   *out++ = 0;
 278   *out++ = 0;
 279   return out - out_start;
 280 }
 281
 282 static inline byte *
 283 read_unary_value(const byte *in, uint *val)
 284 {
 285   uint l = 0;
 286   while (!*in++)
 287     l += 255;
 288   l += in[-1];
 289   *val = l;
 290   return (byte *)in;
 291 }
 292
 293 int
 294 lizard_decompress(const byte *in, byte *out)
 295   /* Requires out being allocated for the decompressed length must be known
 296    * beforehand.  It is desirable to lock the following memory page for
 297    * read-only access to prevent buffer overflow.  Returns the actual
 298    * decompressed length or a negative number when an error has occurred.  */
 299 {
 300   byte *out_start = out;
 301   uint expect_copy_command = 1;
 302   uint len;
 303   if (*in > 17)                                 /* short copy command at BOF */
 304   {
 305     len = *in++ - 17;
 306     goto perform_copy_command;
 307   }
 308   while (1)
 309   {
 310     uint c = *in++;
 311     uint pos;
 312     if (c < 0x10)
 313       if (expect_copy_command == 1)
 314       {
 315         if (!c)
 316         {
 317           in = read_unary_value(in, &len);
 318           len += 18;
 319         }
 320         else
 321           len = c + 3;
 322         goto perform_copy_command;
 323       }
 324       else
 325       {
 326         pos = ((c&0xc)<<6) | *in++;
 327         if (expect_copy_command == 2)
 328         {
 329           pos += 1<<11;
 330           len = 3;
 331         }
 332         else
 333           len = 2;
 334         pos++;
 335       }
 336     else if (c < 0x20)
 337     {
 338       pos = (c&0x8)<<11;
 339       len = c&0x7;
 340       if (!len)
 341       {
 342         in = read_unary_value(in, &len);
 343         len += 9;
 344       }
 345       else
 346         len += 2;
 347       pos |= (*in++ & 0xfc)<<6;
 348       pos |= *in++;
 349       if (!pos)                                 /* EOF */
 350         break;
 351       /* do NOT pos++ */
 352     }
 353     else if (c < 0x40)
 354     {
 355       len = c&0x1f;
 356       if (!len)
 357       {
 358         in = read_unary_value(in, &len);
 359         len += 33;
 360       }
 361       else
 362         len += 2;
 363       pos = (*in++ & 0xfc)<<6;
 364       pos |= *in++;
 365       pos++;
 366     }
 367     else /* high bits encode the length */
 368     {
 369       len = ((c&0xe0)>>5) -2 +3;
 370       pos = (c&0x1c)<<6;
 371       pos |= *in++;
 372       pos++;
 373     }
 374     /* take from the sliding window */
 375     if (len <= pos)
 376     {
 377       memcpy(out, out-pos, len);
 378       out += len;
 379     }
 380     else
 381     {                                           /* overlapping */
 382       for (; len-- > 0; out++)
 383         *out = *(out-pos);
 384       /* It's tempting to use out[-pos] above, but unfortunately it's not the same */
 385     }
 386     /* extract the copy-bits */
 387     len = in[-2] & 0x3;
 388     if (len)
 389     {
 390       expect_copy_command = 0;
 391 #ifdef  CPU_ALLOW_UNALIGNED
 392       * (u32*) out = * (u32*) in;
 393       out += len;
 394       in += len;
 395 #else
 396       while (len-- > 0)
 397         *out++ = *in++;
 398 #endif
 399     }
 400     else
 401       expect_copy_command = 1;
 402     continue;
 403
 404 perform_copy_command:
 405     expect_copy_command = 2;
 406     memcpy(out, in, len);
 407     out += len;
 408     in += len;
 409   }
 410
 411   return out - out_start;
 412 }
 413
 414 /*
 415
 416 Description of the LZO1X format :
 417 =================================
 418
 419 The meaning of the commands depends on the current mode. It can be either
 420 the compressed mode or the copy mode. In some cases, the compressed mode
 421 also distinguishes whether we just left the copy mode or not.
 422
 423 Beginning of file:
 424 ------------------
 425
 426 Start in copy mode. If the first byte is 00010001, it means probably EOF (empty file),
 427 so switch to the compressed mode.  If it is bigger, subtract 17 and copy this number of
 428 the following characters to the output and switch to the compressed mode.
 429 If it is smaller, interpret it as a regular copy mode command.
 430
 431 Compressed mode:
 432 ----------------
 433
 434 Read the first byte of the sequence and determine the type of bit encoding by
 435 looking at the most significant bits.  The sequence is always at least 2 bytes
 436 long.  Decode sequences of these types until the EOF or END marker is read.
 437
 438   length L = length of the text taken from the sliding window
 439
 440     If L=0, then count the number Z of the following zero bytes and add Z*255
 441     to the value of the following non-zero byte.  This allows setting L
 442     arbitrarily high.
 443
 444   position p = relative position of the beginning of the text
 445
 446     Exception: 00010001 00000000 00000000 means EOF
 447
 448   copying C = length 1..3 of copied characters or END=0
 449
 450     C following characters will be copied from the compressed text to the
 451     output.  The number CC is always stored in the 2 least significant bits of
 452     the second last byte of the sequence.
 453
 454     If END is read, the algorithm switches to the copy mode.
 455
 456 pattern                                 length          position
 457
 458 0000ppCC                 pppppppp       2               10 bits         [default interpretation]
 459 0000ppCC                 pppppppp       3               10 bits + 2048  [just after return from copy mode]
 460 0001pLLL L*     ppppppCC pppppppp       3..9 + extend   15 bits         [pos 0 interpreted as EOF]
 461 001LLLLL L*     ppppppCC pppppppp       3..33 + extend  14 bits
 462 LLLpppCC                 pppppppp       3..8            11 bits         [LLL >= 010]
 463
 464 Copy mode:
 465 ----------
 466
 467 Read the first byte and, if the most significant bits are 0000, perform the
 468 following command, otherwise switch to the compressed mode (and evaluate the
 469 command there).
 470
 471 pattern                                 length          position
 472
 473 0000LLLL L*                             4..18 + extend  N/A
 474
 475   Copy L characters from the compressed text to the output.  The overhead for
 476   incompressible strings is only roughly 1/256 + epsilon.
 477
 478 */