]> mj.ucw.cz Git - libucw.git/blobdiff - lib/bucket.c
`buckettool -c' (cat) now separates buckets by an empty line.
[libucw.git] / lib / bucket.c
index 2e6db98bf47cd4a014f9115f86a11fc896e06a6f..f81f03e8d3f5444d0af9d3176edf932f0fe1b5a3 100644 (file)
 /*
  *     Sherlock Library -- Object Buckets
  *
- *     (c) 2001 Martin Mares <mj@ucw.cz>
+ *     (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *     (c) 2004 Robert Spalek <robert@ucw.cz>
  *
- *     Warning: Touches internals of the fb-file module!
+ *     This software may be freely distributed and used according to the terms
+ *     of the GNU Lesser General Public License.
  */
 
+#undef LOCAL_DEBUG
+
 #include "lib/lib.h"
 #include "lib/bucket.h"
 #include "lib/fastbuf.h"
+#include "lib/lfs.h"
+#include "lib/conf.h"
 
 #include <string.h>
+#include <stdlib.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/file.h>
+#include <alloca.h>
 
 static int obuck_fd;
-static struct fastbuf *obuck_fb;
-static struct obuck_header obuck_hdr;
-static sh_off_t start_of_this, start_of_next;
-static char *obuck_name = "db/objects";                /* FIXME */
+static struct obuck_header obuck_hdr, obuck_create_hdr;
+static sh_off_t bucket_find_pos;
+static struct fastbuf *obuck_write_fb;
 
-void
-obuck_init(int writeable)
-{
-  obuck_fb = bopen(obuck_name, (writeable ? O_RDWR | O_CREAT : O_RDONLY), 65536);
-  obuck_fd = obuck_fb->fd;
-}
+/*** Configuration ***/
 
-void
-obuck_cleanup(void)
+byte *obuck_name = "not/configured";
+static uns obuck_io_buflen = 65536;
+static int obuck_shake_buflen = 1048576;
+static uns obuck_shake_security;
+static uns obuck_slurp_buflen = 65536;
+
+static struct cfitem obuck_config[] = {
+  { "Buckets",         CT_SECTION,     NULL },
+  { "BucketFile",      CT_STRING,      &obuck_name },
+  { "BufSize",         CT_INT,         &obuck_io_buflen },
+  { "ShakeBufSize",    CT_INT,         &obuck_shake_buflen },
+  { "ShakeSecurity",   CT_INT,         &obuck_shake_security },
+  { "SlurpBufSize",    CT_INT,         &obuck_slurp_buflen },
+  { NULL,              CT_STOP,        NULL }
+};
+
+static void CONSTRUCTOR obuck_init_config(void)
 {
-  bclose(obuck_fb);
+  cf_register(obuck_config);
 }
 
+/*** Internal operations ***/
+
 static void
-obuck_broken(char *msg)
+obuck_broken(char *msg, sh_off_t pos)
 {
-  die("Object pool corrupted: %s", msg);       /* FIXME */
+  die("Object pool corrupted: %s (pos=%Lx)", msg, (long long) pos);
 }
 
+/*
+ *  We need several types of locks:
+ *
+ *     Read lock       reading parts of bucket file
+ *     Write lock      any write operations
+ *     Append lock     appending to the end of the file
+ *     Scan lock       reading parts which we are certain exist
+ *
+ *  Multiple read and scan locks can co-exist together.
+ *  Scan locks can co-exist with an append lock.
+ *  There can be at most one write/append lock at a time.
+ *
+ *  These lock types map to a pair of normal read-write locks which
+ *  we represent as fcntl() locks on the first and second byte of the
+ *  bucket file. [We cannot use flock() since it happily permits
+ *  locking a shared fd (e.g., after fork()) multiple times and it also
+ *  doesn't offer multiple locks on a single file.]
+ *
+ *                     byte0           byte1
+ *     Read            <read>          <read>
+ *     Write           <write>         <write>
+ *     Append          <write>         -
+ *     Scan            -               <read>
+ */
+
 static inline void
+obuck_do_lock(int type, int start, int len)
+{
+  struct flock fl;
+
+  fl.l_type = type;
+  fl.l_whence = SEEK_SET;
+  fl.l_start = start;
+  fl.l_len = len;
+  if (fcntl(obuck_fd, F_SETLKW, &fl) < 0)
+    die("fcntl lock: %m");
+}
+
+inline void
 obuck_lock_read(void)
 {
-  flock(obuck_fd, LOCK_SH);
+  obuck_do_lock(F_RDLCK, 0, 2);
 }
 
-static inline void
+inline void
 obuck_lock_write(void)
 {
-  flock(obuck_fd, LOCK_EX);
+  obuck_do_lock(F_WRLCK, 0, 2);
+}
+
+static inline void
+obuck_lock_append(void)
+{
+  obuck_do_lock(F_WRLCK, 0, 1);
 }
 
 static inline void
+obuck_lock_read_to_scan(void)
+{
+  obuck_do_lock(F_UNLCK, 0, 1);
+}
+
+inline void
 obuck_unlock(void)
 {
-  flock(obuck_fd, LOCK_UN);
+  obuck_do_lock(F_UNLCK, 0, 2);
+}
+
+/*** FastIO emulation ***/
+
+struct fb_bucket {
+  struct fastbuf fb;
+  sh_off_t start_pos;
+  uns bucket_size;
+  byte buffer[0];
+};
+#define FB_BUCKET(f) ((struct fb_bucket *)(f)->is_fastbuf)
+
+static int obuck_fb_count;
+
+static void
+obuck_fb_close(struct fastbuf *f)
+{
+  obuck_fb_count--;
+  xfree(f);
+}
+
+/* We need to use pread/pwrite since we work on fd's shared between processes */
+
+static int
+obuck_fb_refill(struct fastbuf *f)
+{
+  uns remains, bufsize, size, datasize;
+
+  remains = FB_BUCKET(f)->bucket_size - (uns)f->pos;
+  if (!remains)
+    return 0;
+  f->buffer = FB_BUCKET(f)->buffer;    /* Could have been trimmed by bdirect_read_commit_modified() */
+  bufsize = f->bufend - f->buffer;
+  sh_off_t start = FB_BUCKET(f)->start_pos;
+  sh_off_t pos = start + sizeof(struct obuck_header) + f->pos;
+  if (remains <= bufsize)
+    {
+      datasize = remains;
+      size = start + obuck_bucket_size(FB_BUCKET(f)->bucket_size) - pos;
+    }
+  else
+    size = datasize = bufsize;
+  int l = sh_pread(obuck_fd, f->buffer, size, pos);
+  if (l < 0)
+    die("Error reading bucket: %m");
+  if ((unsigned) l != size)
+    obuck_broken("Short read", FB_BUCKET(f)->start_pos);
+  f->bptr = f->buffer;
+  f->bstop = f->buffer + datasize;
+  f->pos += datasize;
+  if (datasize < size)
+    {
+      if (GET_U32(f->buffer + size - 4) != OBUCK_TRAILER)
+       obuck_broken("Missing trailer", FB_BUCKET(f)->start_pos);
+    }
+  return datasize;
+}
+
+static void
+obuck_fb_spout(struct fastbuf *f)
+{
+  int l = f->bptr - f->buffer;
+  char *c = f->buffer;
+
+  while (l)
+    {
+      int z = sh_pwrite(obuck_fd, c, l, FB_BUCKET(f)->start_pos + sizeof(struct obuck_header) + f->pos);
+      if (z <= 0)
+       die("Error writing bucket: %m");
+      f->pos += z;
+      l -= z;
+      c += z;
+    }
+  f->bptr = f->buffer;
+}
+
+/*** Exported functions ***/
+
+void
+obuck_init(int writeable)
+{
+  sh_off_t size;
+
+  obuck_fd = sh_open(obuck_name, (writeable ? O_RDWR | O_CREAT : O_RDONLY), 0666);
+  if (obuck_fd < 0)
+    die("Unable to open bucket file %s: %m", obuck_name);
+  obuck_lock_read();
+  size = sh_seek(obuck_fd, 0, SEEK_END);
+  if (size)
+    {
+      /* If the bucket pool is not empty, check consistency of its end */
+      u32 check;
+      if (sh_pread(obuck_fd, &check, 4, size-4) != 4 ||
+         check != OBUCK_TRAILER)
+       obuck_broken("Missing trailer of last object", size - 4);
+    }
+  obuck_unlock();
+}
+
+void
+obuck_cleanup(void)
+{
+  close(obuck_fd);
+  if (obuck_fb_count)
+    log(L_ERROR, "Bug: Unbalanced bucket opens/closes: %d streams remain", obuck_fb_count);
+  if (obuck_write_fb)
+    log(L_ERROR, "Bug: Forgot to close bucket write stream");
+}
+
+void
+obuck_sync(void)
+{
+  if (obuck_write_fb)
+    bflush(obuck_write_fb);
+  fsync(obuck_fd);
 }
 
 static void
-obuck_fetch_header(oid_t oid)
+obuck_get(oid_t oid)
 {
-  start_of_this = ((sh_off_t) oid) << OBUCK_SHIFT;
-  bsetpos(obuck_fb, start_of_this);
-  bread(obuck_fb, &obuck_hdr, sizeof(obuck_hdr));
+  bucket_find_pos = obuck_get_pos(oid);
+  if (sh_pread(obuck_fd, &obuck_hdr, sizeof(obuck_hdr), bucket_find_pos) != sizeof(obuck_hdr))
+    obuck_broken("Short header read", bucket_find_pos);
   if (obuck_hdr.magic != OBUCK_MAGIC)
-    obuck_broken("Missing magic number");
+    obuck_broken("Missing magic number", bucket_find_pos);
   if (obuck_hdr.oid == OBUCK_OID_DELETED)
-    obuck_broken("Access to deleted bucket");
+    obuck_broken("Access to deleted bucket", bucket_find_pos);
   if (obuck_hdr.oid != oid)
-    obuck_broken("Invalid backlink");
+    obuck_broken("Invalid backlink", bucket_find_pos);
 }
 
-struct fastbuf *
-obuck_fetch(struct obuck_header *hdrp)
+void
+obuck_find_by_oid(struct obuck_header *hdrp)
 {
+  oid_t oid = hdrp->oid;
+
+  ASSERT(oid < OBUCK_OID_FIRST_SPECIAL);
   obuck_lock_read();
-  obuck_fetch_header(hdrp->oid);
+  obuck_get(oid);
+  obuck_unlock();
   memcpy(hdrp, &obuck_hdr, sizeof(obuck_hdr));
-  return obuck_fb;
 }
 
-void
-obuck_fetch_abort(struct fastbuf *b UNUSED)
+int
+obuck_find_first(struct obuck_header *hdrp, int full)
 {
-  obuck_unlock();
+  bucket_find_pos = 0;
+  obuck_hdr.magic = 0;
+  return obuck_find_next(hdrp, full);
 }
 
-void
-obuck_fetch_end(struct fastbuf *b UNUSED)
+int
+obuck_find_next(struct obuck_header *hdrp, int full)
 {
-  if (bgetl(b) != OBUCK_TRAILER)
-    obuck_broken("Corrupted trailer");
-  obuck_unlock();
+  int c;
+
+  for(;;)
+    {
+      if (obuck_hdr.magic)
+       bucket_find_pos += obuck_bucket_size(obuck_hdr.length);
+      obuck_lock_read();
+      c = sh_pread(obuck_fd, &obuck_hdr, sizeof(obuck_hdr), bucket_find_pos);
+      obuck_unlock();
+      if (!c)
+       return 0;
+      if (c != sizeof(obuck_hdr))
+       obuck_broken("Short header read", bucket_find_pos);
+      if (obuck_hdr.magic != OBUCK_MAGIC)
+       obuck_broken("Missing magic number", bucket_find_pos);
+      if (obuck_hdr.oid != OBUCK_OID_DELETED || full)
+       {
+         memcpy(hdrp, &obuck_hdr, sizeof(obuck_hdr));
+         return 1;
+       }
+    }
 }
 
 struct fastbuf *
-obuck_write(void)
+obuck_fetch(void)
 {
-  obuck_lock_write();
-  bseek(obuck_fb, 0, SEEK_END);
-  start_of_this = btell(obuck_fb);
-  if (start_of_this & (OBUCK_ALIGN - 1))
-    obuck_broken("Misaligned file");
-  obuck_hdr.magic = 0;
-  obuck_hdr.oid = start_of_this >> OBUCK_SHIFT;
-  obuck_hdr.length = obuck_hdr.orig_length = 0;
-  bwrite(obuck_fb, &obuck_hdr, sizeof(obuck_hdr));
-  return obuck_fb;
+  struct fastbuf *b;
+  uns official_buflen = ALIGN(MIN(obuck_hdr.length, obuck_io_buflen), OBUCK_ALIGN);
+  uns real_buflen = official_buflen + OBUCK_ALIGN;
+
+  b = xmalloc(sizeof(struct fb_bucket) + real_buflen);
+  b->buffer = b->bptr = b->bstop = FB_BUCKET(b)->buffer;
+  b->bufend = b->buffer + official_buflen;
+  b->name = "bucket-read";
+  b->pos = 0;
+  b->refill = obuck_fb_refill;
+  b->spout = NULL;
+  b->seek = NULL;
+  b->close = obuck_fb_close;
+  b->config = NULL;
+  b->can_overwrite_buffer = 2;
+  FB_BUCKET(b)->start_pos = bucket_find_pos;
+  FB_BUCKET(b)->bucket_size = obuck_hdr.length;
+  obuck_fb_count++;
+  return b;
+}
+
+oid_t
+obuck_predict_last_oid(void)
+{
+  sh_off_t size = sh_seek(obuck_fd, 0, SEEK_END);
+  return (oid_t)(size >> OBUCK_SHIFT);
+}
+
+struct fastbuf *
+obuck_create(u32 type)
+{
+  ASSERT(!obuck_write_fb);
+
+  obuck_lock_append();
+  sh_off_t start = sh_seek(obuck_fd, 0, SEEK_END);
+  if (start & (OBUCK_ALIGN - 1))
+    obuck_broken("Misaligned file", start);
+  obuck_create_hdr.magic = OBUCK_INCOMPLETE_MAGIC;
+  obuck_create_hdr.oid = start >> OBUCK_SHIFT;
+  obuck_create_hdr.length = 0;
+  obuck_create_hdr.type = type;
+
+  struct fastbuf *b = xmalloc(sizeof(struct fb_bucket) + obuck_io_buflen);
+  obuck_write_fb = b;
+  b->buffer = FB_BUCKET(b)->buffer;
+  b->bptr = b->bstop = b->buffer;
+  b->bufend = b->buffer + obuck_io_buflen;
+  b->pos = -(int)sizeof(obuck_create_hdr);
+  b->name = "bucket-write";
+  b->refill = NULL;
+  b->spout = obuck_fb_spout;
+  b->seek = NULL;
+  b->close = NULL;
+  b->config = NULL;
+  b->can_overwrite_buffer = 0;
+  FB_BUCKET(b)->start_pos = start;
+  FB_BUCKET(b)->bucket_size = 0;
+  bwrite(b, &obuck_create_hdr, sizeof(obuck_create_hdr));
+
+  return b;
 }
 
 void
-obuck_write_end(struct fastbuf *b UNUSED, struct obuck_header *hdrp)
+obuck_create_end(struct fastbuf *b, struct obuck_header *hdrp)
 {
-  int pad;
-  obuck_hdr.magic = OBUCK_MAGIC;
-  obuck_hdr.length = obuck_hdr.orig_length = btell(obuck_fb) - start_of_this - sizeof(obuck_hdr);
-  bputl(obuck_fb, OBUCK_TRAILER);
-  pad = (OBUCK_ALIGN - sizeof(obuck_hdr) - obuck_hdr.length - 4) & (OBUCK_ALIGN - 1);
+  ASSERT(b == obuck_write_fb);
+  obuck_write_fb = NULL;
+
+  obuck_create_hdr.magic = OBUCK_MAGIC;
+  obuck_create_hdr.length = btell(b);
+  int pad = (OBUCK_ALIGN - sizeof(obuck_create_hdr) - obuck_create_hdr.length - 4) & (OBUCK_ALIGN - 1);
   while (pad--)
-    bputc(obuck_fb, 0);
-  bflush(obuck_fb);
-  bsetpos(obuck_fb, start_of_this);
-  /* FIXME: Can be replaced with single pwrite */
-  bwrite(obuck_fb, &obuck_hdr, sizeof(obuck_hdr));
-  bflush(obuck_fb);
+    bputc(b, 0);
+  bputl(b, OBUCK_TRAILER);
+  bflush(b);
+  ASSERT(!((FB_BUCKET(b)->start_pos + sizeof(obuck_create_hdr) + b->pos) & (OBUCK_ALIGN - 1)));
+  if (sh_pwrite(obuck_fd, &obuck_create_hdr, sizeof(obuck_create_hdr), FB_BUCKET(b)->start_pos) != sizeof(obuck_create_hdr))
+    die("Bucket header update failed: %m");
   obuck_unlock();
-  memcpy(hdrp, &obuck_hdr, sizeof(obuck_hdr));
+  memcpy(hdrp, &obuck_create_hdr, sizeof(obuck_create_hdr));
+  xfree(b);
 }
 
 void
 obuck_delete(oid_t oid)
 {
   obuck_lock_write();
-  obuck_fetch_header(oid);
+  obuck_get(oid);
   obuck_hdr.oid = OBUCK_OID_DELETED;
-  bflush(obuck_fb);
-  bsetpos(obuck_fb, start_of_this);
-  bwrite(obuck_fb, &obuck_hdr, sizeof(obuck_hdr));
-  bflush(obuck_fb);
+  sh_pwrite(obuck_fd, &obuck_hdr, sizeof(obuck_hdr), bucket_find_pos);
   obuck_unlock();
 }
 
-struct fastbuf *
-obuck_walk_init(void)
+/*** Fast reading of the whole pool ***/
+
+static struct fastbuf *obuck_rpf;
+static uns slurp_remains;
+static sh_off_t slurp_start, slurp_current, slurp_end;
+
+static int
+obuck_slurp_refill(struct fastbuf *f)
 {
-  start_of_this = start_of_next = 0;
-  obuck_lock_read();
-  return obuck_fb;
+  if (!slurp_remains)
+    return 0;
+  uns l = bdirect_read_prepare(obuck_rpf, &f->buffer);
+  if (!l)
+    obuck_broken("Incomplete object", slurp_start);
+  l = MIN(l, slurp_remains);
+  /* XXX: This probably should be bdirect_read_commit_modified() in some cases,
+   *      but it doesn't hurt since we aren't going to seek.
+   */
+  bdirect_read_commit(obuck_rpf, f->buffer + l);
+  slurp_remains -= l;
+  f->bptr = f->buffer;
+  f->bufend = f->bstop = f->buffer + l;
+  return 1;
 }
 
 struct fastbuf *
-obuck_walk_next(struct fastbuf *b, struct obuck_header *hdrp)
+obuck_slurp_pool(struct obuck_header *hdrp)
 {
-  int c;
+  static struct fastbuf limiter;
+  uns l;
 
-restart:
-  start_of_this = start_of_next;
-  bsetpos(b, start_of_this);
-  c = bgetc(b);
-  if (c < 0)
-    return NULL;
-  bungetc(b, c);
-  bread(b, &obuck_hdr, sizeof(obuck_hdr));
-  if (obuck_hdr.magic != OBUCK_MAGIC)
-    obuck_broken("Missing magic number");
-  start_of_next = (start_of_this + sizeof(obuck_hdr) + obuck_hdr.orig_length +
-       4 + OBUCK_ALIGN - 1) & ~((sh_off_t)(OBUCK_ALIGN - 1));
-  if (obuck_hdr.oid == OBUCK_OID_DELETED)
-    goto restart;
-  memcpy(hdrp, &obuck_hdr, sizeof(obuck_hdr));
-  return b;
+  do
+    {
+      if (!obuck_rpf)
+       {
+         obuck_lock_read();
+         obuck_rpf = bopen(obuck_name, O_RDONLY, obuck_slurp_buflen);
+         slurp_end = bfilesize(obuck_rpf);
+         obuck_lock_read_to_scan();
+       }
+      else
+       {
+         bsetpos(obuck_rpf, slurp_current - 4);
+         if (bgetl(obuck_rpf) != OBUCK_TRAILER)
+           obuck_broken("Missing trailer", slurp_start);
+       }
+      slurp_start = btell(obuck_rpf);
+      if (slurp_start < slurp_end)
+       l = bread(obuck_rpf, hdrp, sizeof(struct obuck_header));
+      else
+       l = 0;
+      if (!l)
+       {
+         bclose(obuck_rpf);
+         obuck_rpf = NULL;
+         obuck_unlock();
+         return NULL;
+       }
+      if (l != sizeof(struct obuck_header))
+       obuck_broken("Short header read", slurp_start);
+      if (hdrp->magic != OBUCK_MAGIC)
+       obuck_broken("Missing magic number", slurp_start);
+      slurp_current = slurp_start + obuck_bucket_size(hdrp->length);
+    }
+  while (hdrp->oid == OBUCK_OID_DELETED);
+  if (obuck_get_pos(hdrp->oid) != slurp_start)
+    obuck_broken("Invalid backlink", slurp_start);
+  slurp_remains = hdrp->length;
+  limiter.bptr = limiter.bstop = limiter.buffer = limiter.bufend = NULL;
+  limiter.name = "Bucket";
+  limiter.pos = 0;
+  limiter.refill = obuck_slurp_refill;
+  limiter.can_overwrite_buffer = obuck_rpf->can_overwrite_buffer;
+  return &limiter;
+}
+
+/*** Shakedown ***/
+
+static inline void
+shake_write(void *addr, int len, sh_off_t pos)
+{
+  int l = sh_pwrite(obuck_fd, addr, len, pos);
+  if (l != len)
+    {
+      if (l < 0)
+       die("obuck_shakedown write error: %m");
+      else
+       die("obuck_shakedown write error: disk full");
+    }
+}
+
+static inline void
+shake_sync(void)
+{
+  if (obuck_shake_security > 1)
+    fdatasync(obuck_fd);
+}
+
+static void
+shake_write_backup(sh_off_t bpos, byte *norm_buf, int norm_size, byte *fragment, int frag_size, sh_off_t frag_pos, int more_size)
+{
+  struct obuck_header *bhdr;
+  int boff = 0;
+  int l;
+  oid_t old_oid;
+
+  /* First of all, the "normal" part -- everything that will be written in this pass */
+  DBG("Backing up first round of changes at position %Lx + %x", (long long) bpos, norm_size);
+  while (boff < norm_size)
+    {
+      /* This needn't be optimized for speed. */
+      bhdr = (struct obuck_header *) (norm_buf + boff);
+      ASSERT(bhdr->magic == OBUCK_MAGIC);
+      l = obuck_bucket_size(bhdr->length);
+      old_oid = bhdr->oid;
+      bhdr->oid = bpos >> OBUCK_SHIFT;
+      shake_write(bhdr, l, bpos);
+      bhdr->oid = old_oid;
+      boff += l;
+      bpos += l;
+    }
+
+  /* If we have an incomplete bucket at the end of the buffer, we must copy it as well. */
+  if (more_size)
+    {
+      DBG("Backing up fragment of size %x and %x more", frag_size, more_size);
+
+      /* First the part we already have in the buffer */
+      bhdr = (struct obuck_header *) fragment;
+      ASSERT(bhdr->magic == OBUCK_MAGIC);
+      old_oid = bhdr->oid;
+      bhdr->oid = bpos >> OBUCK_SHIFT;
+      shake_write(bhdr, frag_size, bpos);
+      bhdr->oid = old_oid;
+      bpos += frag_size;
+
+      /* And then the rest, using a small 64K buffer */
+      byte *auxbuf = alloca(65536);
+      l = 0;
+      while (l < more_size)
+       {
+         int j = MIN(more_size-l, 65536);
+         if (sh_pread(obuck_fd, auxbuf, j, frag_pos + frag_size + l) != j)
+           die("obuck_shakedown read error: %m");
+         shake_write(auxbuf, j, bpos);
+         bpos += j;
+         l += j;
+       }
+    }
+}
+
+static void
+shake_erase(sh_off_t start, sh_off_t end)
+{
+  if (start > end)
+    die("shake_erase called with negative length, that's a bug");
+  ASSERT(!(start & (OBUCK_ALIGN-1)) && !(end & (OBUCK_ALIGN-1)));
+  while (start < end)
+    {
+      u32 check = OBUCK_TRAILER;
+      obuck_hdr.magic = OBUCK_MAGIC;
+      obuck_hdr.oid = OBUCK_OID_DELETED;
+      uns len = MIN(0x40000000, end-start);
+      obuck_hdr.length = len - sizeof(obuck_hdr) - 4;
+      DBG("Erasing %08x bytes at %Lx", len, (long long) start);
+      shake_write(&obuck_hdr, sizeof(obuck_hdr), start);
+      start += len;
+      shake_write(&check, 4, start-4);
+    }
 }
 
 void
-obuck_walk_end(struct fastbuf *b UNUSED)
+obuck_shakedown(int (*kibitz)(struct obuck_header *old, oid_t new, byte *buck))
 {
+  byte *buf;                                           /* Shakedown buffer and its size */
+  int buflen = ALIGN(obuck_shake_buflen, OBUCK_ALIGN);
+  byte *msg;                                           /* Error message we will print */
+  sh_off_t rstart, wstart;                             /* Original and new position of buffer start */
+  sh_off_t r_bucket_start, w_bucket_start;             /* Original and new position of the current bucket */
+  int roff, woff;                                      /* Orig/new position of the current bucket relative to buffer start */
+  int rsize;                                           /* Number of original bytes in the buffer */
+  int l;                                               /* Raw size of the current bucket */
+  int changed = 0;                                     /* "Something has been altered" flag */
+  int wrote_anything = 0;                              /* We already did a write to the bucket file */
+  struct obuck_header *rhdr, *whdr;                    /* Original and new address of header of the current bucket */
+  sh_off_t r_file_size;                                        /* Original size of the bucket file */
+  int more;                                            /* How much does the last bucket overlap the buffer */
+
+  buf = xmalloc(buflen);
+  rstart = wstart = 0;
+  roff = woff = rsize = 0;
+
+  /* We need to be the only accessor, all the object ID's are becoming invalid */
+  obuck_lock_write();
+  r_file_size = sh_seek(obuck_fd, 0, SEEK_END);
+  ASSERT(!(r_file_size & (OBUCK_ALIGN - 1)));
+  if (r_file_size >= (0x100000000 << OBUCK_SHIFT) - buflen)
+    die("Bucket file is too large for safe shakedown. Shaking down with Bucket.ShakeSecurity=0 will still work.");
+
+  DBG("Starting shakedown. Buffer size is %d, original length %Lx", buflen, (long long) r_file_size);
+
+  for(;;)
+    {
+      r_bucket_start = rstart + roff;
+      w_bucket_start = wstart + woff;
+      rhdr = (struct obuck_header *)(buf + roff);
+      whdr = (struct obuck_header *)(buf + woff);
+      if (roff == rsize)
+       {
+         more = 0;
+         goto next;
+       }
+      if (rhdr->magic != OBUCK_MAGIC ||
+         rhdr->oid != OBUCK_OID_DELETED && rhdr->oid != (oid_t)(r_bucket_start >> OBUCK_SHIFT))
+       {
+         msg = "header mismatch";
+         goto broken;
+       }
+      l = obuck_bucket_size(rhdr->length);
+      if (l > buflen)
+       {
+         if (rhdr->oid != OBUCK_OID_DELETED)
+           {
+             msg = "bucket longer than ShakeBufSize";
+             goto broken;
+           }
+         /* Empty buckets are allowed to be large, but we need to handle them specially */
+         DBG("Tricking around an extra-large empty bucket at %Lx + %x", (long long)r_bucket_start, l);
+         rsize = roff + l;
+       }
+      else
+       {
+         if (rsize - roff < l)
+           {
+             more = l - (rsize - roff);
+             goto next;
+           }
+         if (GET_U32((byte *)rhdr + l - 4) != OBUCK_TRAILER)
+           {
+             msg = "missing trailer";
+             goto broken;
+           }
+       }
+      if (rhdr->oid != OBUCK_OID_DELETED)
+       {
+         int status = kibitz(rhdr, w_bucket_start >> OBUCK_SHIFT, (byte *)(rhdr+1));
+         if (status)
+           {
+             int lnew = l;
+             if (status > 1)
+               {
+                 /* Changed! Reconstruct the trailer. */
+                 lnew = obuck_bucket_size(rhdr->length);
+                 ASSERT(lnew <= l);
+                 PUT_U32((byte *)rhdr + lnew - 4, OBUCK_TRAILER);
+                 changed = 1;
+               }
+             whdr = (struct obuck_header *)(buf+woff);
+             if (rhdr != whdr)
+               memmove(whdr, rhdr, lnew);
+             whdr->oid = w_bucket_start >> OBUCK_SHIFT;
+             woff += lnew;
+           }
+         else
+           changed = 1;
+       }
+      else
+       {
+         kibitz(rhdr, OBUCK_OID_DELETED, NULL);
+         changed = 1;
+       }
+      roff += l;
+      continue;
+
+    next:
+      if (changed)
+       {
+         /* Write the new contents of the bucket file */
+         if (!wrote_anything)
+           {
+             if (obuck_shake_security)
+               {
+                 /* But first write a backup at the end of the file to ensure nothing can be lost. */
+                 shake_write_backup(r_file_size, buf, woff, buf+roff, rsize-roff, rstart+roff, more);
+                 shake_sync();
+               }
+             wrote_anything = 1;
+           }
+         if (woff)
+           {
+             DBG("Write %Lx %x", wstart, woff);
+             shake_write(buf, woff, wstart);
+             shake_sync();
+           }
+       }
+      else
+       ASSERT(wstart == rstart);
+
+      /* In any case, update the write position */
+      wstart += woff;
+      woff = 0;
+
+      /* Skip what's been read and if there is any fragment at the end of the buffer, move it to the start */
+      rstart += roff;
+      if (more)
+       {
+         memmove(buf, buf+roff, rsize-roff);
+         rsize = rsize-roff;
+       }
+      else
+       rsize = 0;
+
+      /* And refill the buffer */
+      r_bucket_start = rstart+rsize;   /* Also needed for error messages */
+      l = sh_pread(obuck_fd, buf+rsize, MIN(buflen-rsize, r_file_size - r_bucket_start), r_bucket_start);
+      DBG("Read  %Lx %x (%x inherited)", (long long)r_bucket_start, l, rsize);
+      if (l < 0)
+       die("obuck_shakedown read error: %m");
+      if (!l)
+       {
+         if (!more)
+           break;
+         msg = "unexpected EOF";
+         goto broken;
+       }
+      if (l & (OBUCK_ALIGN-1))
+       {
+         msg = "garbage at the end of file";
+         goto broken;
+       }
+      rsize += l;
+      roff = 0;
+    }
+
+  DBG("Finished at position %Lx", (long long) wstart);
+  sh_ftruncate(obuck_fd, wstart);
+  shake_sync();
+
   obuck_unlock();
+  xfree(buf);
+  return;
+
+ broken:
+  log(L_ERROR, "Error during object pool shakedown: %s (pos=%Ld, id=%x), gathering debris",
+      msg, (long long) r_bucket_start, (uns)(r_bucket_start >> OBUCK_SHIFT));
+  /*
+   * We can attempt to clean up the bucket file by erasing everything between the last
+   * byte written and the next byte to be read. If the secure mode is switched on, we can
+   * guarantee that no data are lost, only some might be duplicated.
+   */
+  shake_erase(wstart, rstart);
+  die("Fatal error during object pool shakedown");
 }
 
+/*** Testing ***/
+
 #ifdef TEST
-int main(void)
+
+#define COUNT 5000
+#define MAXLEN 10000
+#define KILLPERC 13
+#define LEN(i) ((259309*(i))%MAXLEN)
+
+static int test_kibitz(struct obuck_header *h, oid_t new, byte *buck)
+{
+  return 1;
+}
+
+int main(int argc, char **argv)
 {
-  int i, j;
+  int ids[COUNT];
+  unsigned int i, j, cnt;
   struct obuck_header h;
   struct fastbuf *b;
+
+  log_init(NULL);
+  if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 ||
+      optind < argc)
+  {
+    fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
+    exit(1);
+  }
+
+  unlink(obuck_name);
   obuck_init(1);
-  for(j=0; j<100; j++)
+  for(j=0; j<COUNT; j++)
     {
-      b = obuck_write();
-      for(i=0; i<100*j; i++)
-        bputc(b, i);
-      obuck_write_end(b, &h);
-      printf("%d\t%08x\t%d\n", j, h.oid, h.orig_length);
+      b = obuck_create(BUCKET_TYPE_PLAIN);
+      for(i=0; i<LEN(j); i++)
+        bputc(b, (i+j) % 256);
+      obuck_create_end(b, &h);
+      printf("Writing %08x %d\n", h.oid, h.length);
+      ids[j] = h.oid;
     }
-  obuck_delete(0);
-  b = obuck_walk_init();
-  while (b = obuck_walk_next(b, &h))
-    {
-      printf("<<< %08x\t%d\n", h.oid, h.orig_length);
-    }
-  obuck_walk_end(b);
+  for(j=0; j<COUNT; j++)
+    if (j % 100 < KILLPERC)
+      {
+       printf("Deleting %08x\n", ids[j]);
+       obuck_delete(ids[j]);
+      }
+  cnt = 0;
+  for(j=0; j<COUNT; j++)
+    if (j % 100 >= KILLPERC)
+      {
+       cnt++;
+       h.oid = ids[j];
+       obuck_find_by_oid(&h);
+       b = obuck_fetch();
+       printf("Reading %08x %d\n", h.oid, h.length);
+       if (h.length != LEN(j))
+         die("Invalid length");
+       for(i=0; i<h.length; i++)
+         if ((unsigned) bgetc(b) != (i+j) % 256)
+           die("Contents mismatch");
+       if (bgetc(b) != EOF)
+         die("EOF mismatch");
+       bclose(b);
+      }
+  obuck_shakedown(test_kibitz);
+  if (obuck_find_first(&h, 0))
+    do
+      {
+       printf("<<< %08x\t%d\n", h.oid, h.length);
+       cnt--;
+      }
+    while (obuck_find_next(&h, 0));
+  if (cnt)
+    die("Walk mismatch");
   obuck_cleanup();
   return 0;
 }
+
 #endif