From: Martin Mares Date: Mon, 15 Nov 1999 10:32:06 +0000 (+0000) Subject: Added prototype version of the fast database management library. X-Git-Tag: holmes-import~1648 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=a9397765ec1fc7102c080ed072173ce0ee156050;p=libucw.git Added prototype version of the fast database management library. --- diff --git a/lib/db.c b/lib/db.c new file mode 100644 index 00000000..c3d2c886 --- /dev/null +++ b/lib/db.c @@ -0,0 +1,576 @@ +/* + * Sherlock Library -- Fast Database Management Routines + * + * (c) 1999 Martin Mares + */ + +/* + * This library uses the standard algorithm for external hashing (page directory + * mapping topmost K bits of hash value to page address, directory splits and + * so on). Peculiarities of this implementation (aka design decisions): + * + * o We allow both fixed and variable length keys and values (this includes + * zero size values for cases you want to represent only a set of keys). + * o We assume that key_size + val_size < page_size. + * o We never shrink the directory nor free empty pages. (The reason is that + * if the database was once large, it's likely it will again become large soon.) + * o The only pages which can be freed are those of the directory (during + * directory split), so we keep only a simple 32-entry free block list + * and we assume it's sorted. + */ + +#include +#include +#include +#include +#include + +#include "lib.h" +#include "pagecache.h" +#include "db.h" +#include "db_internal.h" + +struct sdbm * +sdbm_open(struct sdbm_options *o) +{ + struct sdbm *d; + struct sdbm_root root, *r; + uns cache_size = o->cache_size ? o->cache_size : 16; + + d = xmalloc(sizeof(struct sdbm)); + bzero(d, sizeof(*d)); + d->flags = o->flags; + d->fd = open(o->name, ((d->flags & SDBM_WRITE) ? O_RDWR : O_RDONLY), 0666); + if (d->fd >= 0) /* Already exists, let's check it */ + { + if (read(d->fd, &root, sizeof(root)) != sizeof(root)) + goto bad; + if (root.magic != SDBM_MAGIC || root.version != SDBM_VERSION) + goto bad; + d->file_size = lseek(d->fd, 0, SEEK_END); + d->page_size = 1 << root.page_order; + d->cache = pgc_open(d->page_size, cache_size); + d->root_page = pgc_read(d->cache, d->fd, 0); + d->root = (void *) d->root_page->data; + /* FIXME: Should we do some locking? */ + } + else if ((d->flags & SDBM_CREAT) && (d->fd = open(o->name, O_RDWR | O_CREAT, 0666)) >= 0) + { + struct page *q; + uns page_order = o->page_order; + if (page_order < 10) + page_order = 10; + d->page_size = 1 << page_order; + d->cache = pgc_open(d->page_size, cache_size); + d->root_page = pgc_get_zero(d->cache, d->fd, 0); + r = d->root = (void *) d->root_page->data; /* Build root page */ + r->magic = SDBM_MAGIC; + r->version = SDBM_VERSION; + r->page_order = page_order; + r->key_size = o->key_size; + r->val_size = o->val_size; + r->dir_start = d->page_size; + r->dir_order = 0; + d->file_size = 3*d->page_size; + q = pgc_get_zero(d->cache, d->fd, d->page_size); /* Build page directory */ + ((u32 *)q->data)[0] = 2*d->page_size; + pgc_put(d->cache, q); + q = pgc_get_zero(d->cache, d->fd, 2*d->page_size); /* Build single data page */ + pgc_put(d->cache, q); + } + else + goto bad; + d->dir_size = 1 << d->root->dir_order; + d->dir_shift = 32 - d->root->dir_order; + d->page_order = d->root->page_order; + d->page_mask = d->page_size - 1; + d->key_size = d->root->key_size; + d->val_size = d->root->val_size; + return d; + +bad: + sdbm_close(d); + return NULL; +} + +void +sdbm_close(struct sdbm *d) +{ + if (d->root_page) + pgc_put(d->cache, d->root_page); + if (d->cache) + pgc_close(d->cache); + if (d->fd >= 0) + close(d->fd); + free(d); +} + +static uns +sdbm_alloc_pages(struct sdbm *d, uns number) +{ + uns where = d->file_size; + d->file_size += number << d->page_order; + return where; +} + +static uns +sdbm_alloc_page(struct sdbm *d) +{ + uns pos; + + if (!d->root->free_pool[0].count) + return sdbm_alloc_pages(d, 1); + pos = d->root->free_pool[0].first; + d->root->free_pool[0].first += d->page_size; + if (!--d->root->free_pool[0].count) + { + memmove(d->root->free_pool, d->root->free_pool+1, SDBM_NUM_FREE_PAGE_POOLS * sizeof(d->root->free_pool[0])); + d->root->free_pool[SDBM_NUM_FREE_PAGE_POOLS-1].count = 0; + } + pgc_mark_dirty(d->cache, d->root_page); + return pos; +} + +static void +sdbm_free_pages(struct sdbm *d, uns start, uns number) +{ + uns i = 0; + + while (d->root->free_pool[i].count) + i++; + d->root->free_pool[i].first = start; + d->root->free_pool[i].count = number; + pgc_mark_dirty(d->cache, d->root_page); +} + +static u32 +sdbm_hash(byte *key, uns keylen) +{ + /* + * This is the same hash function as GDBM uses. + * FIXME: Use a faster one? + */ + u32 value; + int index; + + /* Set the initial value from key. */ + value = 0x238F13AF * keylen; + for (index = 0; index < keylen; index++) + value = value + (key[index] << (index*5 % 24)); + return (1103515243 * value + 12345); +} + +static int +sdbm_get_entry(struct sdbm *d, byte *pos, byte **key, uns *keylen, byte **val, uns *vallen) +{ + byte *p = pos; + + if (d->key_size >= 0) + *keylen = d->key_size; + else + { + *keylen = (p[0] << 8) | p[1]; + p += 2; + } + *key = p; + p += *keylen; + if (d->val_size >= 0) + *vallen = d->val_size; + else + { + *vallen = (p[0] << 8) | p[1]; + p += 2; + } + *val = p; + p += *vallen; + return p - pos; +} + +static int +sdbm_entry_len(struct sdbm *d, uns keylen, uns vallen) +{ + uns len = keylen + vallen; + if (d->key_size < 0) + len += 2; + if (d->val_size < 0) + len += 2; + return len; +} + +static void +sdbm_store_entry(struct sdbm *d, byte *pos, byte *key, uns keylen, byte *val, uns vallen) +{ + if (d->key_size < 0) + { + *pos++ = keylen >> 8; + *pos++ = keylen; + } + memmove(pos, key, keylen); + pos += keylen; + if (d->val_size < 0) + { + *pos++ = vallen >> 8; + *pos++ = vallen; + } + memmove(pos, val, vallen); +} + +static uns +sdbm_page_rank(struct sdbm *d, uns dirpos) +{ + struct page *b; + u32 pg, x; + uns l, r; + uns pm = d->page_mask; + + b = pgc_read(d->cache, d->fd, d->root->dir_start + (dirpos & ~pm)); + pg = GET32(b->data, dirpos & pm); + l = dirpos; + while ((l & pm) && GET32(b->data, (l - 4) & pm) == pg) + l -= 4; + r = dirpos + 4; + /* We heavily depend on unused directory entries being zero */ + while ((r & pm) && GET32(b->data, r & pm) == pg) + r += 4; + pgc_put(d->cache, b); + + if (!(l & pm) && !(r & pm)) + { + /* Note that if it spans page boundary, it must contain an integer number of pages */ + while (l) + { + b = pgc_read(d->cache, d->fd, d->root->dir_start + ((l - 4) & ~pm)); + x = GET32(b->data, 0); + pgc_put(d->cache, b); + if (x != pg) + break; + l -= d->page_size; + } + while (r < 4*d->dir_size) + { + b = pgc_read(d->cache, d->fd, d->root->dir_start + (r & ~pm)); + x = GET32(b->data, 0); + pgc_put(d->cache, b); + if (x != pg) + break; + r += d->page_size; + } + } + return (r - l) >> 2; +} + +static void +sdbm_expand_directory(struct sdbm *d) +{ + struct page *b, *c; + int i; + u32 *dir, *t; + + if (4*d->dir_size < d->page_size) + { + /* It still fits within single page */ + b = pgc_read(d->cache, d->fd, d->root->dir_start); + dir = (u32 *) b->data; + for(i=d->dir_size-1; i>=0; i--) + dir[2*i] = dir[2*i+1] = dir[i]; + pgc_mark_dirty(d->cache, b); + pgc_put(d->cache, b); + } + else + { + uns old_dir = d->root->dir_start; + uns old_dir_pages = 1 << (d->root->dir_order + 2 - d->page_order); + uns page, ent, new_dir; + new_dir = d->root->dir_start = sdbm_alloc_pages(d, 2*old_dir_pages); + ent = 1 << (d->page_order - 3); + for(page=0; page < old_dir_pages; page++) + { + b = pgc_read(d->cache, d->fd, old_dir + (page << d->page_order)); + dir = (u32 *) b->data; + c = pgc_get(d->cache, d->fd, new_dir + (page << (d->page_order + 1))); + t = (u32 *) c->data; + for(i=0; icache, c); + c = pgc_get(d->cache, d->fd, new_dir + (page << (d->page_order + 1)) + d->page_size); + t = (u32 *) c->data; + for(i=0; icache, c); + pgc_put(d->cache, b); + } + if (!(d->flags & SDBM_FAST)) + { + /* + * Unless in super-fast mode, fill old directory pages with zeroes. + * This slows us down a bit, but allows database reconstruction after + * the free list is lost. + */ + for(page=0; page < old_dir_pages; page++) + { + b = pgc_get_zero(d->cache, d->fd, old_dir + (page << d->page_order)); + pgc_put(d->cache, b); + } + } + sdbm_free_pages(d, old_dir, old_dir_pages); + } + + d->root->dir_order++; + d->dir_size = 1 << d->root->dir_order; + d->dir_shift = 32 - d->root->dir_order; + pgc_mark_dirty(d->cache, d->root_page); + if (!(d->flags & SDBM_FAST)) + sdbm_sync(d); +} + +static void +sdbm_split_data(struct sdbm *d, struct sdbm_bucket *s, struct sdbm_bucket *d0, struct sdbm_bucket *d1, uns sigbit) +{ + byte *sp = s->data; + byte *dp[2] = { d0->data, d1->data }; + byte *K, *D; + uns Kl, Dl, sz, i; + + while (sp < s->data + s->used) + { + sz = sdbm_get_entry(d, sp, &K, &Kl, &D, &Dl); + sp += sz; + i = (sdbm_hash(K, Kl) & (1 << sigbit)) ? 1 : 0; + sdbm_store_entry(d, dp[i], K, Kl, D, Dl); + dp[i] += sz; + } + d0->used = dp[0] - d0->data; + d1->used = dp[1] - d1->data; +} + +static void +sdbm_split_dir(struct sdbm *d, uns dirpos, uns count, uns pos) +{ + struct page *b; + uns i; + + count *= 4; + while (count) + { + b = pgc_read(d->cache, d->fd, d->root->dir_start + (dirpos & ~d->page_mask)); + i = d->page_size - (dirpos & d->page_mask); + if (i > count) + i = count; + count -= i; + while (i) + { + GET32(b->data, dirpos & d->page_mask) = pos; + dirpos += 4; + i -= 4; + } + pgc_mark_dirty(d->cache, b); + pgc_put(d->cache, b); + } +} + +static struct page * +sdbm_split_page(struct sdbm *d, struct page *b, u32 hash, uns dirpos) +{ + struct page *p[2]; + uns i, rank, sigbit, rank_log; + + rank = sdbm_page_rank(d, dirpos); /* rank = # of pointers to this page */ + if (rank == 1) + { + sdbm_expand_directory(d); + rank = 2; + dirpos *= 2; + } + rank_log = 1; /* rank_log = log2(rank) */ + while ((1 << rank_log) < rank) + rank_log++; + sigbit = d->dir_shift + rank_log - 1; /* sigbit = bit we split on */ + p[0] = b; + p[1] = pgc_get_zero(d->cache, d->fd, sdbm_alloc_page(d)); + sdbm_split_data(d, (void *) b->data, (void *) p[0]->data, (void *) p[1]->data, sigbit); + sdbm_split_dir(d, (dirpos & ~(4*rank - 1))+2*rank, rank/2, pgc_page_pos(d->cache, p[1])); + pgc_mark_dirty(d->cache, p[0]); + i = (hash & (1 << sigbit)) ? 1 : 0; + pgc_put(d->cache, p[!i]); + return p[i]; +} + +static int +sdbm_access(struct sdbm *d, byte *key, uns keylen, byte *val, uns *vallen, uns mode) /* 0=read, 1=store, 2=replace */ +{ + struct page *p, *q; + u32 hash, h, pos, size; + struct sdbm_bucket *b; + byte *c, *e; + int rc; + + if ((d->key_size >= 0 && keylen != d->key_size) || keylen > 65535) + return SDBM_ERROR_BAD_KEY_SIZE; + if (val && ((d->val_size >= 0 && *vallen != d->val_size) || *vallen >= 65535)) + return SDBM_ERROR_BAD_VAL_SIZE; + if (!mode && !(d->flags & SDBM_WRITE)) + return SDBM_ERROR_READ_ONLY; + hash = sdbm_hash(key, keylen); + if (d->dir_shift != 32) /* avoid shifting by 32 bits */ + h = (hash >> d->dir_shift) << 2; /* offset in the directory */ + else + h = 0; + p = pgc_read(d->cache, d->fd, d->root->dir_start + (h & ~d->page_mask)); + pos = GET32(p->data, h & d->page_mask); + pgc_put(d->cache, p); + q = pgc_read(d->cache, d->fd, pos); + b = (void *) q->data; + c = b->data; + e = c + b->used; + while (c < e) + { + byte *K, *D; + uns Kl, Dl, s; + s = sdbm_get_entry(d, c, &K, &Kl, &D, &Dl); + if (Kl == keylen && !memcmp(K, key, Kl)) + { + /* Gotcha! */ + switch (mode) + { + case 0: /* fetch: found */ + rc = 1; + if (vallen) + { + if (*vallen < Dl) + rc = -3; + *vallen = Dl; + } + if (val) + memcpy(val, D, Dl); + pgc_put(d->cache, q); + return rc; + case 1: /* store: already present */ + pgc_put(d->cache, q); + return 0; + default: /* replace: delete the old one */ + memmove(c, c+s, e-(c+s)); + b->used -= s; + goto insert; + } + } + c += s; + } + if (!mode || !val) /* fetch or delete: no success */ + { + pgc_put(d->cache, q); + return 0; + } + +insert: + if (val) + { + size = sdbm_entry_len(d, keylen, *vallen); + while (b->used + size > d->page_size - sizeof(struct sdbm_bucket)) + { + /* Page overflow, need to split */ + q = sdbm_split_page(d, q, hash, h); + b = (void *) q->data; + } + sdbm_store_entry(d, b->data + b->used, key, keylen, val, *vallen); + b->used += size; + } + pgc_mark_dirty(d->cache, q); + pgc_put(d->cache, q); + if (d->flags & SDBM_SYNC) + sdbm_sync(d); + return 1; +} + +int +sdbm_store(struct sdbm *d, byte *key, uns keylen, byte *val, uns vallen) +{ + return sdbm_access(d, key, keylen, val, &vallen, 1); +} + +int +sdbm_replace(struct sdbm *d, byte *key, uns keylen, byte *val, uns vallen) +{ + return sdbm_access(d, key, keylen, val, &vallen, 2); +} + +int +sdbm_delete(struct sdbm *d, byte *key, uns keylen) +{ + return sdbm_access(d, key, keylen, NULL, NULL, 2); +} + +int +sdbm_fetch(struct sdbm *d, byte *key, uns keylen, byte *val, uns *vallen) +{ + return sdbm_access(d, key, keylen, val, vallen, 0); +} + +void +sdbm_rewind(struct sdbm *d) +{ +} + +int +sdbm_get_next(struct sdbm *d, byte *key, uns *keylen, byte *val, uns *vallen) +{ + return 0; +} + +void +sdbm_sync(struct sdbm *d) +{ + pgc_flush(d->cache); + if (d->flags & SDBM_FSYNC) + fsync(d->fd); +} + +#ifdef TEST + +int main(void) +{ + struct sdbm *d; + struct sdbm_options o = { + name: "db.test", + flags: SDBM_CREAT | SDBM_WRITE, + page_order: 10, + cache_size: 1024, + key_size: -1, + val_size: -1 + }; + byte buf[256]; + int i, j, k; + + puts("OPEN"); + d = sdbm_open(&o); + if (!d) + die("failed: %m"); + + puts("WRITE"); + for(i=0; i<1000000; i++) + { + sprintf(buf, "%d", i); + k = sdbm_store(d, buf, strlen(buf), (byte *) &i, sizeof(i)); +// printf("%s:%d\r", buf, k); + fflush(stdout); + } + sdbm_sync(d); + + puts("READ"); + for(i=0; i<1000000; i++) + { + sprintf(buf, "%d", i); + j = sdbm_fetch(d, buf, strlen(buf), NULL, NULL); +// printf("%s:%d\r", buf, j); + fflush(stdout); + if (!j) + { printf("\nERR: %s %d %x %d\n", buf, j, sdbm_hash(buf, strlen(buf)), d->dir_shift); return 1; } + } + puts(""); + + puts("CLOSE"); + sdbm_close(d); + puts("DONE"); + return 0; +} + +#endif diff --git a/lib/db.h b/lib/db.h new file mode 100644 index 00000000..78fc804a --- /dev/null +++ b/lib/db.h @@ -0,0 +1,50 @@ +/* + * Sherlock Library -- Fast Database Management Routines + * + * (c) 1999 Martin Mares + */ + +#ifndef _SHERLOCK_DB_H +#define _SHERLOCK_DB_H + +struct sdbm; + +struct sdbm_options { /* Set to 0 for default */ + char *name; /* File name */ + uns flags; /* See SDBM_xxx below */ + uns page_order; /* Binary logarithm of file page size */ + uns cache_size; /* Number of cached pages */ + int key_size; /* Key size, -1=variable */ + int val_size; /* Value size, -1=variable */ +}; + +struct sdbm_buf { + uns size; + byte data[0]; +}; + +struct sdbm *sdbm_open(struct sdbm_options *); +void sdbm_close(struct sdbm *); +int sdbm_store(struct sdbm *, byte *key, uns keylen, byte *val, uns vallen); +int sdbm_replace(struct sdbm *, byte *key, uns keylen, byte *val, uns vallen); /* val == NULL -> delete */ +int sdbm_delete(struct sdbm *, byte *key, uns keylen); +int sdbm_fetch(struct sdbm *, byte *key, uns keylen, byte *val, uns *vallen); /* val can be NULL */ +void sdbm_rewind(struct sdbm *); +int sdbm_get_next(struct sdbm *, byte *key, uns *keylen, byte *val, uns *vallen); /* val can be NULL */ +void sdbm_sync(struct sdbm *); + +#define SDBM_CREAT 1 /* Create the database if it doesn't exist */ +#define SDBM_WRITE 2 /* Open the database in read/write mode */ +#define SDBM_SYNC 4 /* Sync after each operation */ +#define SDBM_FAST 8 /* Don't sync on directory splits -- results in slightly faster + * operation, but reconstruction of database after program crash + * may be impossible. + */ +#define SDBM_FSYNC 16 /* When syncing, call fsync() */ + +#define SDBM_ERROR_BAD_KEY_SIZE -1 /* Fixed key size doesn't match */ +#define SDBM_ERROR_BAD_VAL_SIZE -2 /* Fixed value size doesn't match */ +#define SDBM_ERROR_TOO_LARGE -3 /* Key/value doesn't fit in buffer supplied */ +#define SDBM_ERROR_READ_ONLY -4 /* Database has been opened read only */ + +#endif diff --git a/lib/db_internal.h b/lib/db_internal.h new file mode 100644 index 00000000..fac74e6e --- /dev/null +++ b/lib/db_internal.h @@ -0,0 +1,53 @@ +/* + * Sherlock Library -- Fast Database Management Routines -- Internal Declarations + * + * (c) 1999 Martin Mares + */ + +#define SDBM_NUM_FREE_PAGE_POOLS 32 + +struct sdbm_root { + u32 magic; + u32 version; + u32 page_order; /* Binary logarithm of page size */ + s32 key_size; /* Key/val size, -1=variable */ + s32 val_size; + u32 dir_start; /* First page of the page directory */ + u32 dir_order; /* Binary logarithm of directory size */ + /* + * As we know the only thing which can be freed is the page directory + * and it can grow only a limited number of times, we can use a very + * simple-minded representation of the free page pool. We also assume + * these entries are sorted by start position. + */ + struct { + u32 first; + u32 count; + } free_pool[SDBM_NUM_FREE_PAGE_POOLS]; +}; + +struct sdbm_bucket { + u32 used; /* Bytes used in this bucket */ + byte data[0]; +}; + +struct sdbm { + struct page_cache *cache; + int fd; + struct sdbm_root *root; + struct page *root_page; + int key_size; /* Cached values from root page */ + int val_size; + uns page_order; + uns page_size; + uns page_mask; /* page_size - 1 */ + uns dir_size; /* Page directory size in entries */ + uns dir_shift; /* Number of significant bits of hash function */ + uns file_size; + uns flags; +}; + +#define SDBM_MAGIC 0x5344424d +#define SDBM_VERSION 1 + +#define GET32(p,o) *((u32 *)((p)+(o)))