lib/lizard.c

   1 /*
   2  *      LiZaRd -- Fast compression method based on Lempel-Ziv 77
   3  *
   4  *      (c) 2004, Robert Spalek <robert@ucw.cz>
   5  *
   6  *      The file format is based on LZO1X and
   7  *      the compression method is based on zlib.
   8  */
   9
  10 #include "lib/lib.h"
  11 #include "lib/lizard.h"
  12
  13 #include <string.h>
  14 #include <stdlib.h>
  15 #include <sys/mman.h>
  16 #include <sys/user.h>
  17 #include <fcntl.h>
  18
  19 typedef u16 hash_ptr_t;
  20 struct hash_record {
  21   /* the position in the original text is implicit; it is computed by locate_string() */
  22   hash_ptr_t next;                      // 0=end
  23   hash_ptr_t prev;                      // high bit: 0=record in array, 1=head in hash-table (i.e. value of hashf)
  24 };
  25
  26 #define HASH_SIZE       (1<<14)         // size of hash-table
  27 #define HASH_RECORDS    (1<<15)         // maximum number of records in hash-table, 0 is unused ==> subtract 1
  28 #define CHAIN_MAX_TESTS         8       // crop longer collision chains
  29 #define CHAIN_GOOD_MATCH        32      // we already have a good match => end
  30
  31 static inline uns
  32 hashf(byte *string)
  33   /* 0..HASH_SIZE-1 */
  34 {
  35     return string[0] ^ (string[1]<<3) ^ (string[2]<<6);
  36 }
  37
  38 static inline byte *
  39 locate_string(byte *string, uns record_id, uns head)
  40   /* The strings are recorded into the hash-table regularly, hence there is no
  41    * need to store the pointer there.  */
  42 {
  43   string += record_id - head;
  44   if (record_id >= head)
  45     string -= HASH_RECORDS-1;
  46   return string;
  47 }
  48
  49 static inline uns
  50 find_match(uns record_id, struct hash_record *hash_rec, byte *string, byte *string_end, byte **best_ptr, uns head)
  51   /* hash_tab[hash] == record_id points to the head of the double-linked
  52    * link-list of strings with the same hash.  The records are statically
  53    * stored in circular array hash_rec (with the 1st entry unused), and the
  54    * pointers are just 16-bit indices.  The strings in every collision chain
  55    * are ordered by age.  */
  56 {
  57   uns count = CHAIN_MAX_TESTS;
  58   uns best_len = 0;
  59   while (record_id && count-- > 0)
  60   {
  61     byte *record_string = locate_string(string, record_id, head);
  62     byte *cmp = record_string;
  63     if (cmp[0] == string[0] && cmp[2] == string[2])
  64     /* implies cmp[1] == string[1] */
  65     {
  66       if (cmp[3] == string[3])
  67       {
  68         cmp += 4;
  69         if (*cmp++ == string[4] && *cmp++ == string[5]
  70             && *cmp++ == string[6] && *cmp++ == string[7])
  71         {
  72           byte *str = string + 8;
  73           while (str <= string_end && *cmp++ == *str++);
  74         }
  75       }
  76       else
  77         cmp += 4;
  78       uns len = cmp - record_string - 1;        /* cmp points 2 characters after the last match */
  79       if (len > best_len)
  80       {
  81         best_len = len;
  82         *best_ptr = record_string;
  83         if (best_len >= CHAIN_GOOD_MATCH)       /* optimization */
  84           break;
  85       }
  86     }
  87     record_id = hash_rec[record_id].next;
  88   }
  89   return best_len;
  90 }
  91
  92 static uns
  93 hash_string(hash_ptr_t *hash_tab, uns hash, struct hash_record *hash_rec, /*byte *string,*/ uns head, uns *to_delete)
  94   /* We reuse hash-records stored in a circular array.  First, delete the old
  95    * one and then add the new one in front of the link-list.  */
  96 {
  97   struct hash_record *rec = hash_rec + head;
  98   if (*to_delete)                               /* unlink the original record */
  99   {
 100     uns prev_id = rec->prev & ((1<<15)-1);
 101     if (rec->prev & (1<<15))                    /* was a head */
 102       hash_tab[prev_id] = 0;
 103     else                                        /* thanks to the ordering, this was a tail */
 104       hash_rec[prev_id].next = 0;
 105   }
 106   rec->next = hash_tab[hash];
 107   rec->prev = (1<<15) | hash;
 108   hash_rec[rec->next].prev = head;
 109   hash_tab[hash] = head;                        /* add the new record before the link-list */
 110
 111   if (++head >= HASH_RECORDS)                   /* circular buffer, reuse old records, 0 is unused */
 112   {
 113     head = 1;
 114     *to_delete = 1;
 115   }
 116   return head;
 117 }
 118
 119 static inline byte *
 120 dump_unary_value(byte *out, uns l)
 121 {
 122   while (l > 255)
 123   {
 124     l -= 255;
 125     *out++ = 0;
 126   }
 127   *out++ = l;
 128   return out;
 129 }
 130
 131 static byte *
 132 flush_copy_command(uns bof, byte *out, byte *start, uns len)
 133 {
 134   if (bof && len <= 238)
 135     *out++ = len + 17;
 136   else if (len < 4)
 137     /* cannot happen when !!bof */
 138     out[-2] |= len;                             /* invariant: lowest 2 bits 2 bytes back */
 139   else
 140   {
 141     /* leave 2 least significant bits of out[-2] set to 0 */
 142     if (len <= 18)
 143       *out++ = len - 3;
 144     else
 145     {
 146       *out++ = 0;
 147       out = dump_unary_value(out, len - 18);
 148     }
 149   }
 150   while (len-- > 0)
 151     *out++ = *start++;
 152   return out;
 153 }
 154
 155 int
 156 lizard_compress(byte *in, uns in_len, byte *out)
 157   /* Requires out being allocated for at least in_len * LIZARD_MAX_MULTIPLY +
 158    * LIZARD_MAX_ADD.  There must be at least LIZARD_NEEDS_CHARS characters
 159    * allocated after in.  Returns the actual compressed length. */
 160 {
 161   hash_ptr_t hash_tab[HASH_SIZE];
 162   struct hash_record hash_rec[HASH_RECORDS];
 163   byte *in_start = in;
 164   byte *in_end = in + in_len;
 165   byte *out_start = out;
 166   byte *copy_start = in;
 167   uns head = 1;                                 /* 0 in unused */
 168   uns to_delete = 0;
 169   bzero(hash_tab, sizeof(hash_tab));            /* init the hash-table */
 170   while (in < in_end)
 171   {
 172     uns hash = hashf(in);
 173     byte *best;
 174     uns len = find_match(hash_tab[hash], hash_rec, in, in_end, &best, head);
 175     if (len < 3)
 176 #if 0                   // TODO: now, our routine does not detect matches of length 2
 177       if (len == 2 && (in - best->string) < ((1<<10) + 1))
 178       { /* pass-thru */ }
 179       else
 180 #endif
 181       {
 182 literal:
 183         head = hash_string(hash_tab, hash, hash_rec, head, &to_delete);
 184         in++;                                   /* add a literal */
 185         continue;
 186       }
 187
 188     if (in + len > in_end)                      /* crop EOF */
 189     {
 190       len = in_end - in;
 191       if (len < 3)
 192         goto literal;
 193     }
 194     /* Record the match.  */
 195     uns copy_len = in - copy_start;
 196     uns is_in_copy_mode = copy_start==in_start || copy_len >= 4;
 197     uns shift = in - best - 1;
 198     /* Try to use a 2-byte sequence.  */
 199 #if 0
 200     if (len == 2)
 201     {
 202       if (is_in_copy_mode || !copy_len)         /* cannot use with 0 copied characters, because this bit pattern is reserved for copy mode */
 203         goto literal;
 204       else
 205         goto dump_2sequence;
 206     } else
 207 #endif
 208     if (len == 3 && is_in_copy_mode && shift >= (1<<11) && shift < (1<<11) + (1<<10))
 209     {
 210       /* optimisation for length-3 matches after a copy command */
 211       shift -= 1<<11;
 212 dump_2sequence:
 213       if (copy_len > 0)
 214         out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
 215       *out++ = (shift>>6) & ~3;                 /* shift fits into 10 bits */
 216       *out++ = shift & 0xff;
 217     }
 218     /* now, len >= 3 */
 219     else if (shift < (1<<11) && len <= 8)
 220     {
 221       shift |= (len-3 + 2)<<11;
 222       goto dump_2sequence;                      /* shift has 11 bits and contains also len */
 223     }
 224     /* We have to use a 3-byte sequence.  */
 225     else if (len == 3 && is_in_copy_mode)
 226       /* avoid 3-sequence compressed to 3 sequence if it can simply be appended */
 227       goto literal;
 228     else
 229     {
 230       if (copy_len > 0)
 231         out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
 232       if (shift < (1<<14))
 233       {
 234         if (len <= 33)
 235           *out++ = (1<<5) | (len-2);
 236         else
 237         {
 238           *out++ = 1<<5;
 239           out = dump_unary_value(out, len - 33);
 240         }
 241       }
 242       else /* shift < (1<<15)-1 becase of HASH_RECORDS */
 243       {
 244         shift++;                                /* because shift==0 is reserved for EOF */
 245         byte pos_bit = (shift>>11) & (1<<3);
 246         if (len <= 9)
 247           *out++ = (1<<4) | pos_bit | (len-2);
 248         else
 249         {
 250           *out++ = (1<<4) | pos_bit;
 251           out = dump_unary_value(out, len - 9);
 252         }
 253       }
 254       *out++ = (shift>>6) & ~3;                 /* rest of shift fits into 14 bits */
 255       *out++ = shift & 0xff;
 256     }
 257     /* Update the hash-table.  */
 258     head = hash_string(hash_tab, hash, hash_rec, head, &to_delete);
 259     for (uns i=1; i<len; i++)
 260       head = hash_string(hash_tab, hashf(in+i), hash_rec, head, &to_delete);
 261     in += len;
 262     copy_start = in;
 263   }
 264   if (in > in_end)                              /* crop at BOF */
 265     in = in_end;
 266   uns copy_len = in - copy_start;
 267   if (copy_len > 0)
 268     out = flush_copy_command(copy_start==in_start, out, copy_start, copy_len);
 269   *out++ = 17;                                  /* add EOF */
 270   *out++ = 0;
 271   *out++ = 0;
 272   return out - out_start;
 273 }
 274
 275 static inline byte *
 276 read_unary_value(byte *in, uns *val)
 277 {
 278   uns l = 0;
 279   while (!*in++)
 280     l += 255;
 281   l += in[-1];
 282   *val = l;
 283   return in;
 284 }
 285
 286 int
 287 lizard_decompress(byte *in, byte *out)
 288   /* Requires out being allocated for the decompressed length must be known
 289    * beforehand.  It is desirable to lock the following memory page for
 290    * read-only access to prevent buffer overflow.  Returns the actual
 291    * decompressed length or a negative number when an error has occured.  */
 292 {
 293   byte *out_start = out;
 294   uns expect_copy_command = 1;
 295   uns len;
 296   if (*in > 17)                                 /* short copy command at BOF */
 297   {
 298     len = *in++ - 17;
 299     expect_copy_command = 2;
 300     goto perform_copy_command;
 301   }
 302   while (1)
 303   {
 304     uns c = *in++;
 305     uns pos;
 306     if (c < 0x10)
 307       if (expect_copy_command == 1)
 308       {
 309         if (!c)
 310         {
 311           in = read_unary_value(in, &len);
 312           len += 18;
 313         }
 314         else
 315           len = c + 3;
 316         expect_copy_command = 2;
 317         goto perform_copy_command;
 318       }
 319       else
 320       {
 321         pos = ((c&0xc)<<6) | *in++;
 322         if (expect_copy_command == 2)
 323         {
 324           pos += 1<<11;
 325           len = 3;
 326         }
 327         else
 328           len = 2;
 329         pos++;
 330       }
 331     else if (c < 0x20)
 332     {
 333       pos = (c&0x8)<<11;
 334       len = c&0x7;
 335       if (!len)
 336       {
 337         in = read_unary_value(in, &len);
 338         len += 9;
 339       }
 340       else
 341         len += 2;
 342       pos |= (*in++ & 0xfc)<<6;
 343       pos |= *in++;
 344       if (!pos)                                 /* EOF */
 345         break;
 346       /* do NOT pos++ */
 347     }
 348     else if (c < 0x40)
 349     {
 350       len = c&0x1f;
 351       if (!len)
 352       {
 353         in = read_unary_value(in, &len);
 354         len += 33;
 355       }
 356       else
 357         len += 2;
 358       pos = (*in++ & 0xfc)<<6;
 359       pos |= *in++;
 360       pos++;
 361     }
 362     else /* high bits encode the length */
 363     {
 364       len = ((c&0xe0)>>5) -2 +3;
 365       pos = (c&0x1c)<<6;
 366       pos |= *in++;
 367       pos++;
 368     }
 369     /* take from the sliding window */
 370     if (len <= pos)
 371     {
 372       memcpy(out, out-pos, len);
 373       out += len;
 374     }
 375     else
 376     {                                           /* overlapping */
 377       for (; len-- > 0; out++)
 378         *out = out[-pos];
 379     }
 380     /* extract the copy-bits */
 381     len = in[-2] & 0x3;
 382     if (len)
 383     {
 384       expect_copy_command = 0;
 385       while (len-- > 0)
 386         *out++ = *in++;
 387     }
 388     else
 389       expect_copy_command = 1;
 390     continue;
 391
 392 perform_copy_command:
 393     memcpy(out, in, len);
 394     out += len;
 395     in += len;
 396   }
 397
 398   return out - out_start;
 399 }
 400
 401 struct lizard_buffer *
 402 lizard_alloc(uns max_len)
 403 {
 404   int fd = open("/dev/zero", O_RDWR);
 405   if (fd < 0)
 406     die("open(/dev/zero): %m");
 407   struct lizard_buffer *buf = xmalloc(sizeof(struct lizard_buffer));
 408   buf->len = (max_len + 2*PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
 409   buf->ptr = mmap(NULL, buf->len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 410   if (buf->ptr == MAP_FAILED)
 411     die("mmap(/dev/zero): %m");
 412   return buf;
 413 }
 414
 415 void
 416 lizard_free(struct lizard_buffer *buf)
 417 {
 418   munmap(buf->ptr, buf->len);
 419   xfree(buf);
 420 }
 421
 422 int
 423 lizard_decompress_safe(byte *in, struct lizard_buffer *buf, uns expected_length)
 424   /* Decompresses into buf->ptr and returns the length of the uncompressed
 425    * file.  Negative return values signalise errors.  If something goes wrong
 426    * and buffer would overflow, SIGSEGV is raised.  */
 427 {
 428   uns lock_offset = (expected_length + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
 429   if (lock_offset + PAGE_SIZE > buf->len)
 430     return -1;
 431   mprotect(buf->ptr, lock_offset, PROT_READ | PROT_WRITE);
 432   mprotect(buf->ptr + lock_offset, PAGE_SIZE, PROT_READ);
 433   return lizard_decompress(in, buf->ptr);
 434 }
 435
 436 /*
 437
 438 Description of the LZO1X format :
 439 =================================
 440
 441 Beginning of file:
 442 ------------------
 443
 444 If the first byte is 00010001, it means probably EOF (empty file), so switch
 445 to the compressed mode.  If it is bigger, subtract 17 and copy this number of
 446 the following characters to the ouput and switch to the compressed mode.  If
 447 it is smaller, go to the copy mode.
 448
 449 Compressed mode :
 450 -----------------
 451
 452 Read the first byte of the sequence and determine the type of bit-encoding by
 453 looking at the most significant bits.  The sequence is always at least 2 bytes
 454 long.  Decode sequences of these types until the EOF or END marker is read.
 455
 456   length L = length of the text taken from the sliding window
 457
 458     If L=0, then count the number Z of the following zero bytes and add Z*255
 459     to the value of the following non-zero byte.  This allows setting L
 460     arbitrarily high.
 461
 462   position p = relative position of the beginning of the text
 463
 464     Exception: 00010001 00000000 00000000 means EOF
 465
 466   copying C = length 1..3 of copied characters or END=0
 467
 468     C following characters will be copied from the compressed text to the
 469     output.  The number CC is always stored in the 2 least significant bits of
 470     the second last byte of the sequence.
 471
 472     If END is read, the algorithm switches to the copy mode.
 473
 474 pattern                                 length          position
 475
 476 0000ppCC                 pppppppp       2               10 bits (*)
 477 0001pLLL L*     ppppppCC pppppppp       3..9 + extend   15 bits + EOF
 478 001LLLLL L*     ppppppCC pppppppp       3..33 + extend  14 bits
 479 01\
 480 10 \
 481 11  \
 482 LLLpppCC                 pppppppp       3..8            11 bits
 483
 484 Copy mode :
 485 -----------
 486
 487 Read the first byte and, if the most significant bits are 0000, perform the
 488 following command, otherwise switch to the compressed mode (and evaluate the
 489 command).
 490
 491 pattern                                 length          position
 492
 493 0000LLLL L*                             4..18 + extend  N/A
 494
 495   Copy L characters from the compressed text to the output.  The overhead for
 496   incompressible strings is only roughly 1/256 + epsilon.
 497
 498 (*) After reading one copy command, switch to the compressed mode with the
 499 following "optimisation": the pattern 0000ppCC expands to length 3 instead of 2
 500 and 2048 is added to the position (now it is slightly more than 11 bits),
 501 because a sequence of length 2 would never be used.
 502
 503 */