/*
 *	UCW Library -- Fast Database Management Routines
 *
 *	(c) 1999--2001 Martin Mares <mj@ucw.cz>
 *
 *	This software may be freely distributed and used according to the terms
 *	of the GNU Lesser General Public License.
 *
 *	This library uses the standard algorithm for external hashing (page directory
 *	mapping topmost K bits of hash value to page address, directory splits and
 *	so on). Peculiarities of this implementation (aka design decisions):
 *
 *	  o  We allow both fixed and variable length keys and values (this includes
 *	     zero size values for cases you want to represent only a set of keys).
 *	  o  We assume that key_size + val_size < page_size.
 *	  o  We never shrink the directory nor free empty pages. (The reason is that
 *	     if the database was once large, it's likely it will again become large soon.)
 *	  o  The only pages which can be freed are those of the directory (during
 *	     directory split), so we keep only a simple 32-entry free block list
 *	     and we assume it's sorted.
 *	  o  All pointers are always given in pages from start of the file.
 *	     This gives us page_size*2^32 limit for file size which should be enough.
 */
29 #include "ucw/pagecache.h"
31 #include "ucw/db_internal.h"
/*
 * Page-addressing helpers: a page number <x> (or a byte offset <off> into
 * the page directory) is turned into a file offset by shifting with the
 * database's page_order, then handed to the page-cache layer.
 * (Fix: stray line numbers fused before '#define' made these lines invalid
 * preprocessor directives.)
 */
#define GET_PAGE(d,x) pgc_get((d)->cache, (d)->fd, ((sh_off_t)(x)) << (d)->page_order)
#define GET_ZERO_PAGE(d,x) pgc_get_zero((d)->cache, (d)->fd, ((sh_off_t)(x)) << (d)->page_order)
#define READ_PAGE(d,x) pgc_read((d)->cache, (d)->fd, ((sh_off_t)(x)) << (d)->page_order)
#define READ_DIR(d,off) pgc_read((d)->cache, (d)->fd, (((sh_off_t)(d)->root->dir_start) << (d)->page_order) + (off))
/*
 * Open (or, with SDBM_CREAT, create and format) a database file and return
 * its handle.  Existing files are validated against SDBM_MAGIC/SDBM_VERSION.
 * NOTE(review): this extraction is garbled -- every line carries a stray
 * leading number and several lines (braces, error paths, flag setup, the
 * final return) are missing; comments below describe only visible code.
 */
44 sdbm_open(struct sdbm_options *o)
47 struct sdbm_root root, *r;
48 uns cache_size = o->cache_size ? o->cache_size : 16; /* default: 16-page cache */
50 d = xmalloc_zero(sizeof(struct sdbm));
52 d->fd = sh_open(o->name, ((d->flags & SDBM_WRITE) ? O_RDWR : O_RDONLY), 0666);
53 if (d->fd >= 0) /* Already exists, let's check it */
55 if (read(d->fd, &root, sizeof(root)) != sizeof(root))
57 if (root.magic != SDBM_MAGIC || root.version != SDBM_VERSION)
/* Derive page geometry from the stored root, then keep page 0 cached as d->root. */
59 d->file_size = sh_seek(d->fd, 0, SEEK_END) >> root.page_order;
60 d->page_order = root.page_order;
61 d->page_size = 1 << root.page_order;
62 d->cache = pgc_open(d->page_size, cache_size);
63 d->root_page = pgc_read(d->cache, d->fd, 0);
64 d->root = (void *) d->root_page->data;
66 else if ((d->flags & SDBM_CREAT) && (d->fd = sh_open(o->name, O_RDWR | O_CREAT, 0666)) >= 0)
/* Fresh database: page 0 = root, page 1 = directory, page 2 = first data page. */
69 uns page_order = o->page_order;
72 d->page_size = 1 << page_order;
73 d->cache = pgc_open(d->page_size, cache_size);
74 d->root_page = GET_ZERO_PAGE(d, 0);
75 r = d->root = (void *) d->root_page->data; /* Build root page */
76 r->magic = SDBM_MAGIC;
77 r->version = SDBM_VERSION;
78 r->page_order = d->page_order = page_order;
79 r->key_size = o->key_size;
80 r->val_size = o->val_size;
84 q = GET_ZERO_PAGE(d, 1); /* Build page directory */
85 GET32(q->data, 0) = 2; /* the single directory entry points at data page 2 */
87 q = GET_ZERO_PAGE(d, 2); /* Build single data page */
/* Cache frequently used root fields in the in-memory handle. */
92 d->dir_size = 1 << d->root->dir_order;
93 d->dir_shift = 32 - d->root->dir_order;
94 d->page_mask = d->page_size - 1;
95 d->key_size = d->root->key_size;
96 d->val_size = d->root->val_size;
/*
 * Close the database: release the cached root page.
 * NOTE(review): lines are missing from this extraction -- presumably the
 * page cache and file descriptor are also closed here; cannot confirm.
 */
105 sdbm_close(struct sdbm *d)
108 pgc_put(d->cache, d->root_page);
/*
 * Allocate <number> consecutive pages at the end of the file by growing
 * file_size; dies on 32-bit page-number wrap-around.
 * NOTE(review): the trailing 'return where;' expected here is missing
 * from this extraction.
 */
117 sdbm_alloc_pages(struct sdbm *d, uns number)
119 uns where = d->file_size;
120 if (where + number < where) /* Wrap around? */
121 die("SDB: Database file too large, giving up");
122 d->file_size += number;
/*
 * Allocate a single page: take it from the first free-pool range if one
 * exists, else extend the file.  An exhausted pool range shifts the
 * remaining pools down one slot.  The root page is marked dirty since
 * the free-pool table lives there.
 * NOTE(review): declaration of 'pos' and the final return are not visible
 * in this extraction.
 */
127 sdbm_alloc_page(struct sdbm *d)
131 if (!d->root->free_pool[0].count)
132 return sdbm_alloc_pages(d, 1);
133 pos = d->root->free_pool[0].first;
134 d->root->free_pool[0].first++;
135 if (!--d->root->free_pool[0].count)
137 memmove(d->root->free_pool, d->root->free_pool+1, (SDBM_NUM_FREE_PAGE_POOLS-1) * sizeof(d->root->free_pool[0]));
138 d->root->free_pool[SDBM_NUM_FREE_PAGE_POOLS-1].count = 0;
140 pgc_mark_dirty(d->cache, d->root_page);
/*
 * Record a run of freed pages in the first empty slot of the root's
 * free-pool table; the root page is marked dirty so it is written back.
 * NOTE(review): initialization/increment of 'i' is missing from this
 * extraction.
 */
145 sdbm_free_pages(struct sdbm *d, uns start, uns number)
149 while (d->root->free_pool[i].count)
151 ASSERT(i < SDBM_NUM_FREE_PAGE_POOLS);
152 d->root->free_pool[i].first = start;
153 d->root->free_pool[i].count = number;
154 pgc_mark_dirty(d->cache, d->root_page);
158 sdbm_hash(byte *key, uns keylen)
161 * This used to be the same hash function as GDBM uses,
162 * but it turned out that it tends to give the same results
163 * on similar keys. Damn it.
165 u32 value = 0x238F13AF * keylen;
167 value = 37*value + *key++;
168 return (1103515243 * value + 12345);
/*
 * Decode the entry at <pos>: set *key/*keylen and *val/*vallen.  Fixed-size
 * keys/values (d->key_size / d->val_size >= 0) store no length; variable
 * sized ones carry a 16-bit big-endian length prefix.
 * NOTE(review): the cursor bookkeeping ('p', key/val pointer assignment,
 * return of the total entry size) is missing from this extraction.
 */
172 sdbm_get_entry(struct sdbm *d, byte *pos, byte **key, uns *keylen, byte **val, uns *vallen)
176 if (d->key_size >= 0)
177 *keylen = d->key_size;
180 *keylen = (p[0] << 8) | p[1]; /* variable-length key: 16-bit BE prefix */
185 if (d->val_size >= 0)
186 *vallen = d->val_size;
189 *vallen = (p[0] << 8) | p[1]; /* variable-length value: 16-bit BE prefix */
/*
 * Number of bytes an entry occupies inside a bucket: key + value bytes
 * (the additions for the 2-byte length prefixes of variable-sized
 * keys/values, and the return, are missing from this extraction).
 */
198 sdbm_entry_len(struct sdbm *d, uns keylen, uns vallen)
200 uns len = keylen + vallen;
/*
 * Serialize one (key, value) entry at <pos>; inverse of sdbm_get_entry().
 * Variable-sized keys/values get a 16-bit big-endian length prefix.
 * NOTE(review): only the high-byte prefix writes are visible; the low-byte
 * writes and the guards that skip prefixes for fixed-size keys/values are
 * missing from this extraction.
 */
209 sdbm_store_entry(struct sdbm *d, byte *pos, byte *key, uns keylen, byte *val, uns vallen)
213 *pos++ = keylen >> 8;
216 memmove(pos, key, keylen);
220 *pos++ = vallen >> 8;
223 memmove(pos, val, vallen);
/*
 * Rank of the directory entry at byte offset <dirpos>: how many consecutive
 * directory entries point at the same data page (see the caller's comment
 * "rank = # of pointers to this page").  Scans left and right within the
 * entry's directory page, then extends a whole directory page at a time.
 * NOTE(review): initialization of l/r, the loop bodies and the return are
 * missing from this extraction.
 */
227 sdbm_page_rank(struct sdbm *d, uns dirpos)
232 uns pm = d->page_mask;
234 b = READ_DIR(d, dirpos & ~pm); /* directory page holding the entry */
235 pg = GET32(b->data, dirpos & pm); /* data page this entry points at */
237 while ((l & pm) && GET32(b->data, (l - 4) & pm) == pg) /* scan left */
240 /* We heavily depend on unused directory entries being zero */
241 while ((r & pm) && GET32(b->data, r & pm) == pg) /* scan right */
243 pgc_put(d->cache, b);
245 if (!(l & pm) && !(r & pm)) /* run covers whole directory page(s) */
247 /* Note that if it spans page boundary, it must contain an integer number of pages */
250 b = READ_DIR(d, (l - 4) & ~pm); /* extend left one dir page at a time */
251 x = GET32(b->data, 0);
252 pgc_put(d->cache, b);
257 while (r < 4*d->dir_size) /* extend right one dir page at a time */
259 b = READ_DIR(d, r & ~pm);
260 x = GET32(b->data, 0);
261 pgc_put(d->cache, b);
/*
 * Double the page directory (dir_order++).  While the directory still fits
 * in a single page, duplicate entries in place; otherwise allocate a new
 * directory twice the size, copy each old page into two new pages with
 * every entry duplicated, optionally zero the old pages (to allow offline
 * reconstruction when the free list is lost), and put them on the free
 * list.  Finally publish the new geometry via the dirty root page.
 * NOTE(review): several lines (braces, 'b'/'c'/'t' setup, inner loop
 * headers, the trailing sync call) are missing from this extraction.
 */
271 sdbm_expand_directory(struct sdbm *d)
277 if (d->root->dir_order >= 31)
278 die("SDB: Database directory too large, giving up");
280 if (4*d->dir_size < d->page_size) /* 4 bytes per directory entry */
282 /* It still fits within single page */
284 dir = (u32 *) b->data;
285 for(i=d->dir_size-1; i>=0; i--) /* backwards so sources are not clobbered */
286 dir[2*i] = dir[2*i+1] = dir[i];
287 pgc_mark_dirty(d->cache, b);
288 pgc_put(d->cache, b);
292 uns old_dir = d->root->dir_start;
293 uns old_dir_pages = 1 << (d->root->dir_order + 2 - d->page_order);
295 new_dir = d->root->dir_start = sdbm_alloc_pages(d, 2*old_dir_pages);
296 ent = 1 << (d->page_order - 3); /* half the entries of one dir page */
297 for(page=0; page < old_dir_pages; page++)
299 b = READ_PAGE(d, old_dir + page);
300 dir = (u32 *) b->data;
301 c = GET_PAGE(d, new_dir + 2*page); /* lower half -> first new page */
304 t[2*i] = t[2*i+1] = dir[i];
305 pgc_put(d->cache, c);
306 c = GET_PAGE(d, new_dir + 2*page + 1); /* upper half -> second new page */
309 t[2*i] = t[2*i+1] = dir[ent+i];
310 pgc_put(d->cache, c);
311 pgc_put(d->cache, b);
313 if (!(d->flags & SDBM_FAST))
316 * Unless in super-fast mode, fill old directory pages with zeroes.
317 * This slows us down a bit, but allows database reconstruction after
318 * the free list is lost.
320 for(page=0; page < old_dir_pages; page++)
322 b = GET_ZERO_PAGE(d, old_dir + page);
323 pgc_put(d->cache, b);
326 sdbm_free_pages(d, old_dir, old_dir_pages);
/* Publish the new geometry in the (dirty) root page and the cached fields. */
329 d->root->dir_order++;
330 d->dir_size = 1 << d->root->dir_order;
331 d->dir_shift = 32 - d->root->dir_order;
332 pgc_mark_dirty(d->cache, d->root_page);
333 if (!(d->flags & SDBM_FAST))
/*
 * Distribute all entries of source bucket <s> between buckets <d0> and <d1>
 * according to bit <sigbit> of each key's hash (0 -> d0, 1 -> d1), then set
 * both targets' 'used' byte counters from the write cursors.
 * NOTE(review): advancing 'sp' and 'dp[i]' by the entry size 'sz' is
 * missing from this extraction.
 */
338 sdbm_split_data(struct sdbm *d, struct sdbm_bucket *s, struct sdbm_bucket *d0, struct sdbm_bucket *d1, uns sigbit)
341 byte *dp[2] = { d0->data, d1->data };
345 while (sp < s->data + s->used)
347 sz = sdbm_get_entry(d, sp, &K, &Kl, &D, &Dl);
349 i = (sdbm_hash(K, Kl) & (1 << sigbit)) ? 1 : 0;
350 sdbm_store_entry(d, dp[i], K, Kl, D, Dl);
353 d0->used = dp[0] - d0->data;
354 d1->used = dp[1] - d1->data;
/*
 * Overwrite <count> directory entries starting at byte offset <dirpos> with
 * the page number <pos>, marking each touched directory page dirty.
 * NOTE(review): the loop over 'count' and the page-boundary handling based
 * on 'i' (bytes remaining in the current dir page) are missing from this
 * extraction.
 */
358 sdbm_split_dir(struct sdbm *d, uns dirpos, uns count, uns pos)
366 b = READ_DIR(d, dirpos & ~d->page_mask);
367 i = d->page_size - (dirpos & d->page_mask); /* bytes left in this dir page */
373 GET32(b->data, dirpos & d->page_mask) = pos;
377 pgc_mark_dirty(d->cache, b);
378 pgc_put(d->cache, b);
383 sdbm_dirpos(struct sdbm *d, uns hash)
385 if (d->dir_shift != 32) /* avoid shifting by 32 bits */
386 return (hash >> d->dir_shift) << 2; /* offset in the directory */
/*
 * Split an overflowing data page <b>.  If only one directory entry points
 * at it, the directory is doubled first; then a new page is allocated,
 * entries whose hash has <sigbit> set are moved into it, the upper half of
 * the directory run is repointed, and the page the caller's hash lands in
 * is kept.
 * NOTE(review): p[0] setup, the expand/retry control flow and the final
 * return are missing from this extraction.
 */
392 sdbm_split_page(struct sdbm *d, struct page *b, u32 hash)
395 uns i, rank, sigbit, rank_log, dirpos, newpg;
397 dirpos = sdbm_dirpos(d, hash);
398 rank = sdbm_page_rank(d, dirpos); /* rank = # of pointers to this page */
401 sdbm_expand_directory(d);
405 rank_log = 1; /* rank_log = log2(rank) */
406 while ((1U << rank_log) < rank)
408 sigbit = d->dir_shift + rank_log - 1; /* sigbit = bit we split on */
410 newpg = sdbm_alloc_page(d);
411 p[1] = GET_PAGE(d, newpg);
412 sdbm_split_data(d, (void *) b->data, (void *) p[0]->data, (void *) p[1]->data, sigbit);
413 sdbm_split_dir(d, (dirpos & ~(4*rank - 1))+2*rank, rank/2, newpg);
414 pgc_mark_dirty(d->cache, p[0]);
415 i = (hash & (1 << sigbit)) ? 1 : 0; /* which half does the caller's key fall in? */
416 pgc_put(d->cache, p[!i]); /* release the other half's page */
/*
 * Copy a stored key/value (D, Dl) into a user buffer (val, *vallen).
 * Judging by its callers, it returns non-zero when the data does not fit
 * (the SDBM_ERROR_TOO_LARGE path) -- TODO confirm; the entire body is
 * missing from this extraction.
 */
421 sdbm_put_user(byte *D, uns Dl, byte *val, uns *vallen)
/*
 * Central lookup/update engine behind fetch/store/replace/delete.
 * mode: 0 = fetch, 1 = store (keep an existing record), 2 = replace/delete.
 * Returns 1 on success and SDBM_ERROR_* on failure; the 0/"no success"
 * returns are among the lines missing from this extraction.
 * NOTE(review): the '!mode' in the read-only guard looks inverted -- writes
 * (mode != 0), not fetches, should require SDBM_WRITE; verify against
 * upstream before relying on it.
 */
435 sdbm_access(struct sdbm *d, byte *key, uns keylen, byte *val, uns *vallen, uns mode) /* 0=read, 1=store, 2=replace */
438 u32 hash, h, pos, size;
439 struct sdbm_bucket *b;
/* Validate sizes against the database's fixed key/value sizes and the
 * 16-bit length-prefix limit. */
443 if ((d->key_size >= 0 && keylen != (uns) d->key_size) || keylen > 65535)
444 return SDBM_ERROR_BAD_KEY_SIZE;
445 if (val && ((d->val_size >= 0 && *vallen != (uns) d->val_size) || *vallen >= 65535) && mode)
446 return SDBM_ERROR_BAD_VAL_SIZE;
447 if (!mode && !(d->flags & SDBM_WRITE))
448 return SDBM_ERROR_READ_ONLY;
449 hash = sdbm_hash(key, keylen);
450 h = sdbm_dirpos(d, hash);
/* Directory entry at offset h names the data page for this hash. */
451 p = READ_DIR(d, h & ~d->page_mask);
452 pos = GET32(p->data, h & d->page_mask);
453 pgc_put(d->cache, p);
454 q = READ_PAGE(d, pos);
455 b = (void *) q->data;
/* Scan the bucket for an entry with a matching key. */
462 s = sdbm_get_entry(d, c, &K, &Kl, &D, &Dl);
463 if (Kl == keylen && !memcmp(K, key, Kl))
468 case 0: /* fetch: found */
469 rc = sdbm_put_user(D, Dl, val, vallen);
470 pgc_put(d->cache, q);
471 return rc ? SDBM_ERROR_TOO_LARGE : 1;
472 case 1: /* store: already present */
473 pgc_put(d->cache, q);
475 default: /* replace: delete the old one */
476 memmove(c, c+s, e-(c+s)); /* close the gap left by the removed entry */
483 if (!mode || !val) /* fetch or delete: no success */
485 pgc_put(d->cache, q);
/* Insert the new entry, splitting the page while it cannot hold it. */
492 size = sdbm_entry_len(d, keylen, *vallen);
493 while (b->used + size > d->page_size - sizeof(struct sdbm_bucket))
495 /* Page overflow, need to split */
496 if (size >= d->page_size - sizeof(struct sdbm_bucket)) /* can never fit */
498 pgc_put(d->cache, q);
499 return SDBM_ERROR_GIANT;
501 q = sdbm_split_page(d, q, hash);
502 b = (void *) q->data;
504 sdbm_store_entry(d, b->data + b->used, key, keylen, val, *vallen);
507 pgc_mark_dirty(d->cache, q);
508 pgc_put(d->cache, q);
509 if (d->flags & SDBM_SYNC)
515 sdbm_store(struct sdbm *d, byte *key, uns keylen, byte *val, uns vallen)
517 return sdbm_access(d, key, keylen, val, &vallen, 1);
521 sdbm_replace(struct sdbm *d, byte *key, uns keylen, byte *val, uns vallen)
523 return sdbm_access(d, key, keylen, val, &vallen, 2);
527 sdbm_delete(struct sdbm *d, byte *key, uns keylen)
529 return sdbm_access(d, key, keylen, NULL, NULL, 2);
533 sdbm_fetch(struct sdbm *d, byte *key, uns keylen, byte *val, uns *vallen)
535 return sdbm_access(d, key, keylen, val, vallen, 0);
/*
 * Reset the sequential-scan cursor used by sdbm_get_next().
 * NOTE(review): resetting find_page/find_pos is among the lines missing
 * from this extraction.
 */
539 sdbm_rewind(struct sdbm *d)
543 d->find_free_list = 0;
/*
 * Sequential scan: return the next (key, value) pair, walking data pages in
 * file order while skipping the page directory and the free-pool ranges
 * (which the design assumes are sorted).  Copies the pair into the user
 * buffers; returns 1 on success, SDBM_ERROR_TOO_LARGE when a buffer is too
 * small.  NOTE(review): the end-of-scan return, loop headers and cursor
 * updates for 'page' are missing from this extraction.
 */
547 sdbm_get_next(struct sdbm *d, byte *key, uns *keylen, byte *val, uns *vallen)
549 uns page = d->find_page;
550 uns pos = d->find_pos;
554 struct sdbm_bucket *b;
560 if (page >= d->file_size) /* ran past the end of the file */
562 if (page == d->root->dir_start) /* skip over the page directory */
563 page += (4*d->dir_size + d->page_size - 1) >> d->page_order;
564 else if (page == d->root->free_pool[d->find_free_list].first) /* skip a free range */
565 page += d->root->free_pool[d->find_free_list++].count;
570 p = READ_PAGE(d, page);
571 b = (void *) p->data;
572 if (pos - 4 >= b->used) /* bucket exhausted, advance to the next page */
576 pgc_put(d->cache, p);
579 c = sdbm_get_entry(d, p->data + pos, &K, &Kl, &V, &Vl);
581 d->find_pos = pos + c; /* remember where to resume */
582 c = sdbm_put_user(K, Kl, key, keylen) ||
583 sdbm_put_user(V, Vl, val, vallen);
584 pgc_put(d->cache, p);
585 return c ? SDBM_ERROR_TOO_LARGE : 1;
/*
 * Sync the database state to disk.  NOTE(review): the definition continues
 * past the end of this chunk; only the SDBM_FSYNC guard is visible --
 * presumably the cache is flushed and, under SDBM_FSYNC, the fd fsync'ed.
 */
593 sdbm_sync(struct sdbm *d)
596 if (d->flags & SDBM_FSYNC)