lib/lizard.c

   1 /*
   2  *      LiZzaRd -- Fast compression method based on Lempel-Ziv 77
   3  *
   4  *      (c) 2004, Robert Spalek <robert@ucw.cz>
   5  *
   6  *      The file format is based on LZO1X and
   7  *      the compression method is based on zlib.
   8  */
   9
  10 #include "lib/lib.h"
  11 #include "lib/lizzard.h"
  12
  13 #include <string.h>
  14
  15 typedef u16 hash_ptr_t;
  16 struct hash_record {
  17   byte *string;                         // in original text
  18   hash_ptr_t next4, next3;              // 0=end
  19 };
  20
  21 #define HASH_SIZE       ((1<<14) + 27)  // prime, size of hash-table
  22 #define HASH_RECORDS    (1<<15)         // maximum number of records in hash-table, 0 is unused ==> subtract 1
  23 #define CHAIN_MAX_TESTS 16              // crop longer collision chains
  24 #define CHAIN_GOOD_MATCH        16      // we already have a good match => end
  25
  26 static inline uns
  27 hashf(byte *string)
  28 {
  29     uns hash;
  30 #ifdef  CPU_ALLOW_UNALIGNED
  31     hash = * (uns *) string;
  32 #elif   CPU_LITTLE_ENDIAN
  33     hash = string[0] | (string[1]<<8) | (string[2]<<16) | (string[3]<<24);
  34 #else
  35     hash = string[3] | (string[2]<<8) | (string[1]<<16) | (string[0]<<24);
  36 #endif
  37     return hash;
  38 }
  39 #ifdef  CPU_LITTLE_ENDIAN
  40 #define MASK_4TH_CHAR   (~(0xff << 24))
  41 #else
  42 #define MASK_4TH_CHAR   (~0xff)
  43 #endif
  44
  45 static inline uns
  46 find_match(uns list_id, hash_ptr_t *hash_tab, struct hash_record *hash_rec, uns hash, byte *string, byte *string_end, byte **best_ptr)
  47   /*
  48    * We heavily rely on gcc optimisations (e.g. list_id is a constant hence
  49    * some branches of the code will be pruned).
  50    *
  51    * hash_tab[hash] points to the head of the link-list of strings with
  52    * the same hash.  The records are statically stored in circular array
  53    * hash_rec (with 1st entry unused), and the pointers are just 16-bit
  54    * indices.  In one array, we store 2 independent link-lists: with 3 and 4
  55    * hashed characters.
  56    *
  57    * The data structure fulfills 2 invariants: the strings in every collision
  58    * chain are ordered by age, and the head of every link-list is placed
  59    * correctly.  The records in the middle of the link-lists may be placed
  60    * wrong, and this is handled here by checking the string order.
  61    */
  62 {
  63   if (list_id == 3)                             /* Find the 1st record in the collision chain.  It is correct.  */
  64     hash &= MASK_4TH_CHAR;
  65   hash %= HASH_SIZE;
  66   if (!hash_tab[hash])
  67     return 0;
  68   struct hash_record *record = hash_rec + hash_tab[hash];
  69
  70   uns count = CHAIN_MAX_TESTS;
  71   uns best_len = 0;
  72   byte *last_cmp = string_end;
  73   hash_ptr_t *last_ptr = NULL;
  74   while (count-- > 0)
  75   {
  76     byte *cmp = record->string;
  77     if (cmp >= last_cmp)                        /* collision chain is ordered by age */
  78     {
  79       *last_ptr = 0;                            /* prune */
  80       break;
  81     }
  82     last_cmp = cmp;
  83     if (cmp[0] == string[0] && cmp[2] == string[2])
  84     {
  85       if (cmp[3] == string[3])
  86       {
  87         /* implies cmp[1] == string[1], becase we hash into 14+eps bits */
  88         cmp += 4;
  89         if (*cmp++ == string[4] && *cmp++ == string[5]
  90             && *cmp++ == string[6] && *cmp++ == string[7])
  91         {
  92           byte *str = string + 8;
  93           while (str <= string_end && *cmp++ == *str++);
  94         }
  95       }
  96       else if (list_id == 3)
  97         /* hashing 3 characters implies cmp[1] == string[1]
  98            hashing 4 characters implies cmp[1] != string[1] */
  99         cmp += 4;
 100       else
 101         goto next_collision;
 102       uns len = cmp - record->string - 1;       /* cmp points 2 characters after the last match */
 103       if (len > best_len)
 104       {
 105         best_len = len;
 106         *best_ptr = record->string;
 107         if (best_len >= CHAIN_GOOD_MATCH)       /* optimisation */
 108           break;
 109       }
 110     }
 111 next_collision:
 112     if (list_id == 3)                           /* move to the next collision */
 113       last_ptr = &record->next3;
 114     else
 115       last_ptr = &record->next4;
 116     uns next = *last_ptr;
 117     if (!next)
 118       break;
 119     record = hash_rec + next;
 120   }
 121   return best_len;
 122 }
 123
 124 static uns
 125 hash_string(byte *string, uns hash, hash_ptr_t *tab3, hash_ptr_t *tab4, struct hash_record *hash_rec, uns head)
 126   /*
 127    * Updates are performed more often than searches, so they are a bit lazy: we
 128    * add new records in front of the link-lists, but only delete them when they
 129    * are the _head_ of some link-list.  This is done in constant time.
 130    *
 131    * By reusing an already allocated space in the circular array we necessarily
 132    * alter records that are referred in the _middle_ of some link-list.
 133    * However the newly inserted string is younger than anything else in the
 134    * table and it can be checked easily.
 135    */
 136 {
 137   struct hash_record *rec = hash_rec + head;
 138   if (rec->string)                              /* unlink the original record if it was the _head_ of some link-list */
 139   {
 140     uns h4 = hashf(rec->string);
 141     uns h3 = h4 & MASK_4TH_CHAR;
 142     h3 %= HASH_SIZE;
 143     h4 %= HASH_SIZE;
 144     if (tab3[h3] == head)
 145       tab3[h3] = 0;
 146     if (tab4[h4] == head)
 147       tab4[h4] = 0;
 148
 149   }
 150   uns h3 = (hash & MASK_4TH_CHAR) % HASH_SIZE;
 151   uns h4 = hash % HASH_SIZE;
 152   rec->string = string;                         /* is younger than any string inserted before */
 153   rec->next3 = tab3[h3];
 154   rec->next4 = tab4[h4];
 155   tab3[h3] = head;                              /* add the new record before the link-list */
 156   tab4[h4] = head;
 157   head++;                                       /* circular buffer, reuse old records, 0 is unused */
 158   if (head >= HASH_RECORDS)
 159     head = 1;
 160   return head;
 161 }
 162
 163 static inline byte *
 164 dump_unary_value(byte *out, uns l)
 165 {
 166   while (l > 255)
 167   {
 168     l -= 255;
 169     *out++ = 0;
 170   }
 171   *out++ = l;
 172   return out;
 173 }
 174
 175 static byte *
 176 flush_copy_command(uns bof, byte *out, byte *start, uns len)
 177 {
 178   if (bof && len <= 238)
 179     *out++ = len + 17;
 180   else if (len < 4)
 181     /* cannot happen when !!bof */
 182     out[-2] |= len;                             /* invariant: lowest 2 bits 2 bytes back */
 183   else
 184   {
 185     /* leave 2 least significant bits of out[-2] set to 0 */
 186     if (len <= 18)
 187       *out++ = len - 3;
 188     else
 189     {
 190       *out++ = 0;
 191       out = dump_unary_value(out, len - 18);
 192     }
 193   }
 194   while (len-- > 0)
 195     *out++ = *start++;
 196   return out;
 197 }
 198
 199 int
 200 lizzard_compress(byte *in, uns in_len, byte *out)
 201   /* Requires out being allocated for at least in_len * LIZZARD_MAX_MULTIPLY +
 202    * LIZZARD_MAX_ADD.  There must be at least LIZZARD_NEEDS_CHARS characters
 203    * allocated after in.  Returns the actual compressed length. */
 204 {
 205   hash_ptr_t hash_tab3[HASH_SIZE], hash_tab4[HASH_SIZE];
 206   struct hash_record hash_rec[HASH_RECORDS];
 207   byte *in_start = in;
 208   byte *in_end = in + in_len;
 209   byte *out_start = out;
 210   byte *copy_start = in;
 211   uns head = 1;                                 /* 0 in unused */
 212   bzero(hash_tab3, sizeof(hash_tab3));          /* init the hash-table */
 213   bzero(hash_tab4, sizeof(hash_tab4));
 214   while (in < in_end)
 215   {
 216     uns hash = hashf(in);
 217     byte *best;
 218     uns len = find_match(4, hash_tab4, hash_rec, hash, in, in_end, &best);
 219     if (len >= 3)
 220       goto match;
 221     len = find_match(3, hash_tab3, hash_rec, hash, in, in_end, &best);
 222
 223 match_found:
 224     if (len >= 3)
 225       goto match;
 226 #if 0                   // TODO: now, our routine does not detect matches of length 2
 227     if (len == 2 && (in - best->string) < ((1<<10) + 1))
 228       goto match;
 229 #endif
 230 literal:
 231     head = hash_string(in, hash, hash_tab3, hash_tab4, hash_rec, head);
 232     in++;                                       /* add a literal */
 233     continue;
 234
 235 match:
 236     if (in + len > in_end)                      /* crop EOF */
 237     {
 238       len = in_end - in;
 239       if (len < 3)
 240         goto match_found;
 241     }
 242     /* Record the match.  */
 243     uns copy_len = in - copy_start;
 244     uns is_in_copy_mode = copy_start==in_start || copy_len >= 4;
 245     uns shift = in - best - 1;
 246     /* Try to use a 2-byte sequence.  */
 247     if (len == 2)
 248     {
 249       if (is_in_copy_mode || !copy_len)         /* cannot use with 0 copied characters, because this bit pattern is reserved for copy mode */
 250         goto literal;
 251
 252 dump_2sequence:
 253       if (copy_len > 0)
 254         out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
 255       *out++ = (shift>>6) & ~3;                 /* shift fits into 10 bits */
 256       *out++ = shift & 0xff;
 257     }
 258     else if (len == 3 && is_in_copy_mode && shift >= (1<<11) && shift < (1<<11) + (1<<10))
 259     {
 260       /* optimisation for length-3 matches after a copy command */
 261       shift -= 1<<11;
 262       goto dump_2sequence;
 263     }
 264     /* now, len >= 3 */
 265     else if (shift < (1<<11) && len <= 8)
 266     {
 267       shift |= (len-3 + 2)<<11;
 268       goto dump_2sequence;                      /* shift has 11 bits and contains also len */
 269     }
 270     /* We have to use a 3-byte sequence.  */
 271     else if (len == 3 && is_in_copy_mode)
 272       /* avoid 3-sequence compressed to 3 sequence if it can simply be appended */
 273       goto literal;
 274     else
 275     {
 276       if (copy_len > 0)
 277         out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
 278       if (shift < (1<<14))
 279       {
 280         if (len <= 33)
 281           *out++ = (1<<5) | (len-2);
 282         else
 283         {
 284           *out++ = 1<<5;
 285           out = dump_unary_value(out, len - 33);
 286         }
 287       }
 288       else /* shift < (1<<15)-1 becase of HASH_RECORDS */
 289       {
 290         shift++;                                /* because shift==0 is reserved for EOF */
 291         byte pos_bit = (shift>>11) & (1<<3);
 292         if (len <= 9)
 293           *out++ = (1<<4) | pos_bit | (len-2);
 294         else
 295         {
 296           *out++ = (1<<4) | pos_bit;
 297           out = dump_unary_value(out, len - 9);
 298         }
 299       }
 300       *out++ = (shift>>6) & ~3;                 /* rest of shift fits into 14 bits */
 301       *out++ = shift & 0xff;
 302     }
 303     /* Update the hash-table.  */
 304     head = hash_string(in, hash, hash_tab3, hash_tab4, hash_rec, head);
 305     for (uns i=1; i<len; i++)
 306       head = hash_string(in+i, hashf(in+i), hash_tab3, hash_tab4, hash_rec, head);
 307     in += len;
 308     copy_start = in;
 309   }
 310   if (in > in_end)                              /* crop at BOF */
 311     in = in_end;
 312   uns copy_len = in - copy_start;
 313   if (copy_len > 0)
 314     out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
 315   *out++ = 17;                                  /* add EOF */
 316   *out++ = 0;
 317   *out++ = 0;
 318   return out - out_start;
 319 }
 320
 321 static inline byte *
 322 read_unary_value(byte *in, uns *val)
 323 {
 324   uns l = 0;
 325   while (!*in++)
 326     l += 255;
 327   l += in[-1];
 328   *val = l;
 329   return in;
 330 }
 331
 332 int
 333 lizzard_decompress(byte *in, byte *out)
 334   /* Requires out being allocated for the decompressed length must be known
 335    * beforehand.  It is desirable to lock the following memory page for
 336    * read-only access to prevent buffer overflow.  Returns the actual
 337    * decompressed length or a negative number when an error has occured.  */
 338 {
 339   byte *out_start = out;
 340   uns expect_copy_command = 1;
 341   uns len;
 342   if (*in > 17)                                 /* short copy command at BOF */
 343   {
 344     len = *in++ - 17;
 345     expect_copy_command = 2;
 346     goto perform_copy_command;
 347   }
 348   while (1)
 349   {
 350     uns c = *in++;
 351     uns pos;
 352     if (c < 0x10)
 353       if (expect_copy_command == 1)
 354       {
 355         if (!c)
 356         {
 357           in = read_unary_value(in, &len);
 358           len += 18;
 359         }
 360         else
 361           len = c + 3;
 362         expect_copy_command = 2;
 363         goto perform_copy_command;
 364       }
 365       else
 366       {
 367         pos = ((c&0xc)<<6) | *in++;
 368         if (expect_copy_command == 2)
 369         {
 370           pos += 1<<11;
 371           len = 3;
 372         }
 373         else
 374           len = 2;
 375         pos++;
 376       }
 377     else if (c < 0x20)
 378     {
 379       pos = (c&0x8)<<11;
 380       len = c&0x7;
 381       if (!len)
 382       {
 383         in = read_unary_value(in, &len);
 384         len += 9;
 385       }
 386       else
 387         len += 2;
 388       pos |= (*in++ & 0xfc)<<6;
 389       pos |= *in++;
 390       if (!pos)                                 /* EOF */
 391         break;
 392       /* do NOT pos++ */
 393     }
 394     else if (c < 0x40)
 395     {
 396       len = c&0x1f;
 397       if (!len)
 398       {
 399         in = read_unary_value(in, &len);
 400         len += 33;
 401       }
 402       else
 403         len += 2;
 404       pos = (*in++ & 0xfc)<<6;
 405       pos |= *in++;
 406       pos++;
 407     }
 408     else /* high bits encode the length */
 409     {
 410       len = ((c&0xe0)>>5) -2 +3;
 411       pos = (c&0x1c)<<6;
 412       pos |= *in++;
 413       pos++;
 414     }
 415     /* take from the sliding window */
 416     if (len <= pos)
 417     {
 418       memcpy(out, out-pos, len);
 419       out += len;
 420     }
 421     else
 422     {                                           /* overlapping */
 423       for (; len-- > 0; out++)
 424         *out = out[-pos];
 425     }
 426     /* extract the copy-bits */
 427     len = in[-2] & 0x3;
 428     if (len)
 429     {
 430       expect_copy_command = 0;
 431       while (len-- > 0)
 432         *out++ = *in++;
 433     }
 434     else
 435       expect_copy_command = 1;
 436     continue;
 437
 438 perform_copy_command:
 439     memcpy(out, in, len);
 440     out += len;
 441     in += len;
 442   }
 443
 444   return out - out_start;
 445 }
 446
 447 /*
 448
 449 Description of the LZO1X format :
 450 =================================
 451
 452 Beginning of file:
 453 ------------------
 454
 455 If the first byte is 00010001, it means probably EOF (empty file), so switch
 456 to the compressed mode.  If it is bigger, subtract 17 and copy this number of
 457 the following characters to the ouput and switch to the compressed mode.  If
 458 it is smaller, go to the copy mode.
 459
 460 Compressed mode :
 461 -----------------
 462
 463 Read the first byte of the sequence and determine the type of bit-encoding by
 464 looking at the most significant bits.  The sequence is always at least 2 bytes
 465 long.  Decode sequences of these types until the EOF or END marker is read.
 466
 467   length L = length of the text taken from the sliding window
 468
 469     If L=0, then count the number Z of the following zero bytes and add Z*255
 470     to the value of the following non-zero byte.  This allows setting L
 471     arbitrarily high.
 472
 473   position p = relative position of the beginning of the text
 474
 475     Exception: 00010001 00000000 00000000 means EOF
 476
 477   copying C = length 1..3 of copied characters or END=0
 478
 479     C following characters will be copied from the compressed text to the
 480     output.  The number CC is always stored in the 2 least significant bits of
 481     the second last byte of the sequence.
 482
 483     If END is read, the algorithm switches to the copy mode.
 484
 485 pattern                                 length          position
 486
 487 0000ppCC                 pppppppp       2               10 bits (*)
 488 0001pLLL L*     ppppppCC pppppppp       3..9 + extend   15 bits + EOF
 489 001LLLLL L*     ppppppCC pppppppp       3..33 + extend  14 bits
 490 01\
 491 10 \
 492 11  \
 493 LLLpppCC                 pppppppp       3..8            11 bits
 494
 495 Copy mode :
 496 -----------
 497
 498 Read the first byte and, if the most significant bits are 0000, perform the
 499 following command, otherwise switch to the compressed mode (and evaluate the
 500 command).
 501
 502 pattern                                 length          position
 503
 504 0000LLLL L*                             4..18 + extend  N/A
 505
 506   Copy L characters from the compressed text to the output.  The overhead for
 507   incompressible strings is only roughly 1/256 + epsilon.
 508
 509 (*) After reading one copy command, switch to the compressed mode with the
 510 following "optimisation": the pattern 0000ppCC expands to length 3 instead of 2
 511 and 2048 is added to the position (now it is slightly more than 11 bits),
 512 because a sequence of length 2 would never be used.
 513
 514 */