2 * Sherlock Library -- Fast Database Management Routines
4 * (c) 1999 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
8 * This library uses the standard algorithm for external hashing (page directory
9 * mapping topmost K bits of hash value to page address, directory splits and
10 * so on). Peculiarities of this implementation (aka design decisions):
12 * o We allow both fixed and variable length keys and values (this includes
13 * zero size values for cases you want to represent only a set of keys).
14 * o We assume that key_size + val_size < page_size.
15 * o We never shrink the directory nor free empty pages. (The reason is that
16 * if the database was once large, it's likely it will again become large soon.)
17 * o The only pages which can be freed are those of the directory (during
18 * directory split), so we keep only a simple 32-entry free block list
19 * and we assume it's sorted.
29 #include "pagecache.h"
31 #include "db_internal.h"
/*
 * Open an existing database file, or create a new one when SDBM_CREAT
 * is set and the file does not exist yet.  On the create path the root
 * page, a one-page directory and a single empty data page are built,
 * so a fresh database occupies exactly three pages.
 * (Error-handling paths and some assignments are elided in this view --
 * presumably they clean up and fail; confirm against the full source.)
 */
sdbm_open(struct sdbm_options *o)
  struct sdbm_root root, *r;
  /* A cache_size of 0 means "use the default of 16 pages" */
  uns cache_size = o->cache_size ? o->cache_size : 16;
  d = xmalloc(sizeof(struct sdbm));
  /* Open read-write only when the caller asked for write access */
  d->fd = open(o->name, ((d->flags & SDBM_WRITE) ? O_RDWR : O_RDONLY), 0666);
  if (d->fd >= 0) /* Already exists, let's check it */
  /* Validate the on-disk root page header */
  if (read(d->fd, &root, sizeof(root)) != sizeof(root))
  if (root.magic != SDBM_MAGIC || root.version != SDBM_VERSION)
  d->file_size = lseek(d->fd, 0, SEEK_END);
  /* Page size is recorded on disk as a power of two */
  d->page_size = 1 << root.page_order;
  d->cache = pgc_open(d->page_size, cache_size);
  /* Keep the root page pinned in the cache for the whole session */
  d->root_page = pgc_read(d->cache, d->fd, 0);
  d->root = (void *) d->root_page->data;
  else if ((d->flags & SDBM_CREAT) && (d->fd = open(o->name, O_RDWR | O_CREAT, 0666)) >= 0)
  uns page_order = o->page_order;
  d->page_size = 1 << page_order;
  d->cache = pgc_open(d->page_size, cache_size);
  d->root_page = pgc_get_zero(d->cache, d->fd, 0);
  r = d->root = (void *) d->root_page->data; /* Build root page */
  r->magic = SDBM_MAGIC;
  r->version = SDBM_VERSION;
  r->page_order = page_order;
  /* key_size/val_size: fixed sizes, or (presumably) negative for variable-size -- see sdbm_get_entry */
  r->key_size = o->key_size;
  r->val_size = o->val_size;
  /* Directory starts on the page right after the root page */
  r->dir_start = d->page_size;
  /* Root page + directory page + one data page */
  d->file_size = 3*d->page_size;
  q = pgc_get_zero(d->cache, d->fd, d->page_size); /* Build page directory */
  /* The single directory entry points at the data page (page 2) */
  ((u32 *)q->data)[0] = 2*d->page_size;
  q = pgc_get_zero(d->cache, d->fd, 2*d->page_size); /* Build single data page */
  /* Cache frequently used root-page fields in the handle */
  d->dir_size = 1 << d->root->dir_order;
  d->dir_shift = 32 - d->root->dir_order;
  d->page_order = d->root->page_order;
  d->page_mask = d->page_size - 1;
  d->key_size = d->root->key_size;
  d->val_size = d->root->val_size;
/*
 * Close the database: release the pinned root page.
 * (Cache shutdown, closing the fd and freeing d presumably follow
 * on lines elided here -- confirm against the full source.)
 */
sdbm_close(struct sdbm *d)
  pgc_put(d->cache, d->root_page);
/*
 * Allocate `number` consecutive pages by extending the file.
 * The new run starts at the current end of file; callers receive
 * that byte offset (the return statement is elided in this view).
 */
sdbm_alloc_pages(struct sdbm *d, uns number)
  uns where = d->file_size;
  /* number is a page count; file_size is in bytes */
  d->file_size += number << d->page_order;
116 sdbm_alloc_page(struct sdbm *d)
120 if (!d->root->free_pool[0].count)
121 return sdbm_alloc_pages(d, 1);
122 pos = d->root->free_pool[0].first;
123 d->root->free_pool[0].first += d->page_size;
124 if (!--d->root->free_pool[0].count)
126 memmove(d->root->free_pool, d->root->free_pool+1, SDBM_NUM_FREE_PAGE_POOLS * sizeof(d->root->free_pool[0]));
127 d->root->free_pool[SDBM_NUM_FREE_PAGE_POOLS-1].count = 0;
129 pgc_mark_dirty(d->cache, d->root_page);
/*
 * Return a run of `number` pages starting at byte offset `start` to the
 * free page pool: find the first empty slot and record the run there.
 * NOTE(review): no bounds check against SDBM_NUM_FREE_PAGE_POOLS is
 * visible -- presumably callers (only the directory split, per the
 * header comment) guarantee a free slot exists; verify.
 */
sdbm_free_pages(struct sdbm *d, uns start, uns number)
  /* Scan for the first unused slot (count == 0) */
  while (d->root->free_pool[i].count)
  d->root->free_pool[i].first = start;
  d->root->free_pool[i].count = number;
  /* The pool is stored on the root page */
  pgc_mark_dirty(d->cache, d->root_page);
/*
 * Hash a key of `keylen` bytes into a 32-bit value.  The topmost
 * dir_order bits of the result select the directory entry.
 */
sdbm_hash(byte *key, uns keylen)
  /*
   * This is the same hash function as GDBM uses.
   * It seems to work well.
   */
  /* Set the initial value from key. */
  value = 0x238F13AF * keylen;
  for (index = 0; index < keylen; index++)
    /* Mix each byte in; the shift amount cycles 0,5,10,15,20,1,6,... (mod 24) */
    value = value + (key[index] << (index*5 % 24));
  /* Final linear-congruential scramble */
  return (1103515243 * value + 12345);
/*
 * Decode the entry stored at `pos`: set *key/*keylen and *val/*vallen
 * to point into the page.  Fixed-size keys/values (key_size/val_size
 * >= 0) are stored bare; variable-size ones (negative configured size)
 * carry a 16-bit big-endian length prefix.  Callers advance by the
 * return value, so it is evidently the total encoded entry length.
 */
sdbm_get_entry(struct sdbm *d, byte *pos, byte **key, uns *keylen, byte **val, uns *vallen)
  if (d->key_size >= 0)		/* fixed-size key: length comes from the root page */
  *keylen = d->key_size;
  /* variable-size key: 16-bit big-endian length prefix */
  *keylen = (p[0] << 8) | p[1];
  if (d->val_size >= 0)		/* fixed-size value */
  *vallen = d->val_size;
  /* variable-size value: 16-bit big-endian length prefix */
  *vallen = (p[0] << 8) | p[1];
/*
 * Number of bytes an entry with the given key and value lengths
 * occupies on a page.  (The extra prefix bytes for variable-size
 * fields are presumably added on lines elided here -- confirm.)
 */
sdbm_entry_len(struct sdbm *d, uns keylen, uns vallen)
  uns len = keylen + vallen;
/*
 * Encode a key/value pair at `pos`, mirroring the layout decoded by
 * sdbm_get_entry: variable-size fields get a 16-bit big-endian length
 * prefix (the guards for the fixed-size case are elided in this view).
 */
sdbm_store_entry(struct sdbm *d, byte *pos, byte *key, uns keylen, byte *val, uns vallen)
  /* high byte of the key length prefix (low byte on an elided line) */
  *pos++ = keylen >> 8;
  memmove(pos, key, keylen);
  /* high byte of the value length prefix */
  *pos++ = vallen >> 8;
  memmove(pos, val, vallen);
/*
 * Compute the "rank" of the data page referenced by the directory
 * entry at byte offset `dirpos`: the number of consecutive directory
 * entries pointing at the same page.  Scans left and right inside the
 * containing directory page; when the run reaches a page boundary it
 * continues across neighbouring directory pages (which, by invariant,
 * are either entirely inside or entirely outside the run).
 */
sdbm_page_rank(struct sdbm *d, uns dirpos)
  uns pm = d->page_mask;
  b = pgc_read(d->cache, d->fd, d->root->dir_start + (dirpos & ~pm));
  /* The page pointer whose run we are measuring */
  pg = GET32(b->data, dirpos & pm);
  /* Scan left while the previous entry (4 bytes each) matches */
  while ((l & pm) && GET32(b->data, (l - 4) & pm) == pg)
  /* We heavily depend on unused directory entries being zero */
  while ((r & pm) && GET32(b->data, r & pm) == pg)
  pgc_put(d->cache, b);
  /* Run touches both page boundaries -> may extend over whole dir pages */
  if (!(l & pm) && !(r & pm))
  /* Note that if it spans page boundary, it must contain an integer number of pages */
  b = pgc_read(d->cache, d->fd, d->root->dir_start + ((l - 4) & ~pm));
  /* Checking only entry 0 suffices: a crossing run covers whole pages */
  x = GET32(b->data, 0);
  pgc_put(d->cache, b);
  /* Directory occupies 4*dir_size bytes in total */
  while (r < 4*d->dir_size)
  b = pgc_read(d->cache, d->fd, d->root->dir_start + (r & ~pm));
  x = GET32(b->data, 0);
  pgc_put(d->cache, b);
/*
 * Double the directory (dir_order + 1): every old entry is duplicated
 * into a pair of adjacent new entries, so all pages keep their rank
 * relative to the finer hash prefix.  A directory that still fits in
 * one page is expanded in place; otherwise new pages are allocated,
 * the entries copied over, and the old directory pages freed.
 */
sdbm_expand_directory(struct sdbm *d)
  /* Doubled directory (4 bytes per entry) still fits in a single page */
  if (4*d->dir_size < d->page_size)
  /* It still fits within single page */
  b = pgc_read(d->cache, d->fd, d->root->dir_start);
  dir = (u32 *) b->data;
  /* Walk backwards so entries are read before being overwritten */
  for(i=d->dir_size-1; i>=0; i--)
    dir[2*i] = dir[2*i+1] = dir[i];
  pgc_mark_dirty(d->cache, b);
  pgc_put(d->cache, b);
  uns old_dir = d->root->dir_start;
  /* dir_size entries * 4 bytes, expressed in pages */
  uns old_dir_pages = 1 << (d->root->dir_order + 2 - d->page_order);
  new_dir = d->root->dir_start = sdbm_alloc_pages(d, 2*old_dir_pages);
  /* Half the number of 4-byte entries per page */
  ent = 1 << (d->page_order - 3);
  for(page=0; page < old_dir_pages; page++)
  b = pgc_read(d->cache, d->fd, old_dir + (page << d->page_order));
  dir = (u32 *) b->data;
  /* First half of the old page expands into the even new page... */
  c = pgc_get(d->cache, d->fd, new_dir + (page << (d->page_order + 1)));
  t[2*i] = t[2*i+1] = dir[i];
  pgc_put(d->cache, c);
  /* ...second half into the odd new page */
  c = pgc_get(d->cache, d->fd, new_dir + (page << (d->page_order + 1)) + d->page_size);
  t[2*i] = t[2*i+1] = dir[ent+i];
  pgc_put(d->cache, c);
  pgc_put(d->cache, b);
  if (!(d->flags & SDBM_FAST))
  /*
   * Unless in super-fast mode, fill old directory pages with zeroes.
   * This slows us down a bit, but allows database reconstruction after
   * the free list is lost.
   */
  for(page=0; page < old_dir_pages; page++)
  b = pgc_get_zero(d->cache, d->fd, old_dir + (page << d->page_order));
  pgc_put(d->cache, b);
  /* Recycle the old directory pages (the only source of free pages) */
  sdbm_free_pages(d, old_dir, old_dir_pages);
  /* Publish the new geometry in the root page and the handle */
  d->root->dir_order++;
  d->dir_size = 1 << d->root->dir_order;
  d->dir_shift = 32 - d->root->dir_order;
  pgc_mark_dirty(d->cache, d->root_page);
  if (!(d->flags & SDBM_FAST))
/*
 * Redistribute all entries of source bucket `s` between buckets d0 and
 * d1 according to bit `sigbit` of each key's hash (0 -> d0, 1 -> d1),
 * and record the resulting used sizes.
 */
sdbm_split_data(struct sdbm *d, struct sdbm_bucket *s, struct sdbm_bucket *d0, struct sdbm_bucket *d1, uns sigbit)
  /* Independent write cursors for the two destination buckets */
  byte *dp[2] = { d0->data, d1->data };
  while (sp < s->data + s->used)
  sz = sdbm_get_entry(d, sp, &K, &Kl, &D, &Dl);
  /* The split bit of the key's hash selects the destination */
  i = (sdbm_hash(K, Kl) & (1 << sigbit)) ? 1 : 0;
  sdbm_store_entry(d, dp[i], K, Kl, D, Dl);
  d0->used = dp[0] - d0->data;
  d1->used = dp[1] - d1->data;
/*
 * Overwrite `count` directory entries, starting at byte offset
 * `dirpos`, with the page pointer `pos`.  Crosses directory page
 * boundaries as needed (the outer loop is elided in this view).
 */
sdbm_split_dir(struct sdbm *d, uns dirpos, uns count, uns pos)
  b = pgc_read(d->cache, d->fd, d->root->dir_start + (dirpos & ~d->page_mask));
  /* Bytes remaining in this directory page from dirpos onwards */
  i = d->page_size - (dirpos & d->page_mask);
  /* GET32 is evidently usable as an lvalue here */
  GET32(b->data, dirpos & d->page_mask) = pos;
  pgc_mark_dirty(d->cache, b);
  pgc_put(d->cache, b);
/*
 * Split the overflowing data page `b`.  If only one directory entry
 * points at it (rank 1), the directory is doubled first.  The entries
 * are then redistributed by the hash bit that separates the two halves
 * of the directory run, and the upper half of the run is redirected to
 * a freshly allocated page.  Evidently returns the page on which the
 * key hashing to `hash` now lives (caller keeps using the result).
 */
sdbm_split_page(struct sdbm *d, struct page *b, u32 hash, uns dirpos)
  uns i, rank, sigbit, rank_log;
  rank = sdbm_page_rank(d, dirpos); /* rank = # of pointers to this page */
  /* Rank 1: no room to split the run -> double the directory first */
  sdbm_expand_directory(d);
  rank_log = 1; /* rank_log = log2(rank) */
  while ((1U << rank_log) < rank)
  sigbit = d->dir_shift + rank_log - 1; /* sigbit = bit we split on */
  p[1] = pgc_get(d->cache, d->fd, sdbm_alloc_page(d));
  sdbm_split_data(d, (void *) b->data, (void *) p[0]->data, (void *) p[1]->data, sigbit);
  /* Point the upper half of the directory run at the new page */
  sdbm_split_dir(d, (dirpos & ~(4*rank - 1))+2*rank, rank/2, p[1]->pos);
  pgc_mark_dirty(d->cache, p[0]);
  /* Which half does the caller's key hash into? */
  i = (hash & (1 << sigbit)) ? 1 : 0;
  /* Release the half we don't need; the other is returned */
  pgc_put(d->cache, p[!i]);
/*
 * Copy a fetched datum D (length Dl) into the user buffer `val` and
 * report its length via *vallen.  Callers treat a nonzero return as
 * "user buffer too small" and map it to SDBM_ERROR_TOO_LARGE.
 */
sdbm_put_user(byte *D, uns Dl, byte *val, uns *vallen)
412 sdbm_access(struct sdbm *d, byte *key, uns keylen, byte *val, uns *vallen, uns mode) /* 0=read, 1=store, 2=replace */
415 u32 hash, h, pos, size;
416 struct sdbm_bucket *b;
420 if ((d->key_size >= 0 && keylen != (uns) d->key_size) || keylen > 65535)
421 return SDBM_ERROR_BAD_KEY_SIZE;
422 if (val && ((d->val_size >= 0 && *vallen != (uns) d->val_size) || *vallen >= 65535) && mode)
423 return SDBM_ERROR_BAD_VAL_SIZE;
424 if (!mode && !(d->flags & SDBM_WRITE))
425 return SDBM_ERROR_READ_ONLY;
426 hash = sdbm_hash(key, keylen);
427 if (d->dir_shift != 32) /* avoid shifting by 32 bits */
428 h = (hash >> d->dir_shift) << 2; /* offset in the directory */
431 p = pgc_read(d->cache, d->fd, d->root->dir_start + (h & ~d->page_mask));
432 pos = GET32(p->data, h & d->page_mask);
433 pgc_put(d->cache, p);
434 q = pgc_read(d->cache, d->fd, pos);
435 b = (void *) q->data;
442 s = sdbm_get_entry(d, c, &K, &Kl, &D, &Dl);
443 if (Kl == keylen && !memcmp(K, key, Kl))
448 case 0: /* fetch: found */
449 rc = sdbm_put_user(D, Dl, val, vallen);
450 pgc_put(d->cache, q);
451 return rc ? SDBM_ERROR_TOO_LARGE : 1;
452 case 1: /* store: already present */
453 pgc_put(d->cache, q);
455 default: /* replace: delete the old one */
456 memmove(c, c+s, e-(c+s));
463 if (!mode || !val) /* fetch or delete: no success */
465 pgc_put(d->cache, q);
472 size = sdbm_entry_len(d, keylen, *vallen);
473 while (b->used + size > d->page_size - sizeof(struct sdbm_bucket))
475 /* Page overflow, need to split */
476 if (size >= d->page_size - sizeof(struct sdbm_bucket))
478 pgc_put(d->cache, q);
479 return SDBM_ERROR_GIANT;
481 q = sdbm_split_page(d, q, hash, h);
482 b = (void *) q->data;
484 sdbm_store_entry(d, b->data + b->used, key, keylen, val, *vallen);
487 pgc_mark_dirty(d->cache, q);
488 pgc_put(d->cache, q);
489 if (d->flags & SDBM_SYNC)
/*
 * Insert a new key/value pair; the "already present" path of
 * sdbm_access refuses to overwrite an existing key.
 */
sdbm_store(struct sdbm *d, byte *key, uns keylen, byte *val, uns vallen)
  return sdbm_access(d, key, keylen, val, &vallen, 1);
/*
 * Store a key/value pair, deleting any existing entry with the same
 * key first (mode 2 in sdbm_access).
 */
sdbm_replace(struct sdbm *d, byte *key, uns keylen, byte *val, uns vallen)
  return sdbm_access(d, key, keylen, val, &vallen, 2);
/*
 * Delete the entry with the given key: replace mode with a NULL value.
 */
sdbm_delete(struct sdbm *d, byte *key, uns keylen)
  return sdbm_access(d, key, keylen, NULL, NULL, 2);
/*
 * Look up `key`; on success the value is copied to *val and its
 * length stored in *vallen (via sdbm_put_user).
 */
sdbm_fetch(struct sdbm *d, byte *key, uns keylen, byte *val, uns *vallen)
  return sdbm_access(d, key, keylen, val, vallen, 0);
/*
 * Reset the sequential-scan cursor used by sdbm_get_next: start just
 * past the root page (page 0) and restart the free-list skipping from
 * the first pool entry.
 */
sdbm_rewind(struct sdbm *d)
  d->find_pos = d->page_size;
  d->find_free_list = 0;
/*
 * Sequential walk over the whole database: return the key/value pair
 * following the cursor in d->find_pos (set by sdbm_rewind), skipping
 * the directory and the page runs recorded in the free pool (which
 * the header comment says is kept sorted).  Returns 1 on success or
 * SDBM_ERROR_TOO_LARGE when a user buffer is too small; end-of-file
 * handling is on lines elided here.
 */
sdbm_get_next(struct sdbm *d, byte *key, uns *keylen, byte *val, uns *vallen)
  uns pos = d->find_pos;
  struct sdbm_bucket *b;
  /* Offset of the cursor within its data page */
  c = pos & d->page_mask;
  /* Past the last page: the scan is finished */
  if (pos >= d->file_size)
  if (pos == d->root->dir_start)
    /* Skip the whole directory, rounded up to whole pages */
    pos += (4*d->dir_size + d->page_size - 1) & ~d->page_mask;
  else if (pos == d->root->free_pool[d->find_free_list].first)
    /* Skip a freed run; advance to the next (sorted) pool entry */
    pos += d->root->free_pool[d->find_free_list++].count << d->page_order;
  p = pgc_read(d->cache, d->fd, pos & ~d->page_mask);
  b = (void *) p->data;
  /* Entries start after the bucket header; past b->used -> next page */
  if (c - 4 >= b->used)
  pos = (pos & ~d->page_mask) + d->page_size;
  pgc_put(d->cache, p);
  c = sdbm_get_entry(d, p->data + c, &K, &Kl, &V, &Vl);
  /* Remember the position just past this entry for the next call */
  d->find_pos = pos + c;
  c = sdbm_put_user(K, Kl, key, keylen) ||
  sdbm_put_user(V, Vl, val, vallen);
  pgc_put(d->cache, p);
  return c ? SDBM_ERROR_TOO_LARGE : 1;
/*
 * Write back all pending modifications; with SDBM_FSYNC set they are
 * additionally forced to stable storage (presumably via fsync(d->fd)
 * on a line elided here -- confirm).
 */
sdbm_sync(struct sdbm *d)
  if (d->flags & SDBM_FSYNC)