From 9ff2d1d3d98e39cfe57e38519427a7754d73cb6c Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Wed, 31 Jan 2007 22:45:41 +0100 Subject: [PATCH] Several bits of the new sorter. The basic infrastructure seems to fit well. The 2-way merge module is adapted from the old sorter and it's definitely worth some more optimization. The sort-test really works :-) More to come tomorrow. --- lib/sorter/Makefile | 4 +- lib/sorter/common.h | 47 ++++++++++- lib/sorter/config.c | 2 +- lib/sorter/govern.c | 90 ++++++++++++++++++++ lib/sorter/s-internal.h | 14 ++++ lib/sorter/s-twoway.h | 109 +++++++++++++++++++++++++ lib/sorter/sort-test.c | 56 +++++++++++++ lib/sorter/sorter.h | 176 +++++++++++++++++++++++++++++++++++----- 8 files changed, 473 insertions(+), 25 deletions(-) create mode 100644 lib/sorter/govern.c create mode 100644 lib/sorter/s-internal.h create mode 100644 lib/sorter/s-twoway.h create mode 100644 lib/sorter/sort-test.c diff --git a/lib/sorter/Makefile b/lib/sorter/Makefile index b1388898..ad14a927 100644 --- a/lib/sorter/Makefile +++ b/lib/sorter/Makefile @@ -2,4 +2,6 @@ DIRS+=lib/sorter -LIBUCW_MODS+=sorter/config +LIBUCW_MODS+=sorter/config sorter/govern + +$(o)/lib/sorter/sort-test: $(o)/lib/sorter/sort-test.o $(LIBUCW) diff --git a/lib/sorter/common.h b/lib/sorter/common.h index 76d3e8c8..9747a5e0 100644 --- a/lib/sorter/common.h +++ b/lib/sorter/common.h @@ -1,5 +1,5 @@ /* - * UCW Library -- Universal Sorter + * UCW Library -- Universal Sorter: Common Declarations * * (c) 2007 Martin Mares * @@ -10,7 +10,52 @@ #ifndef _UCW_SORTER_COMMON_H #define _UCW_SORTER_COMMON_H +#include "lib/clists.h" + /* Configuration, some of the variables are used by the old sorter, too. 
*/
 extern uns sorter_trace, sorter_presort_bufsize, sorter_stream_bufsize;
 
+struct sort_bucket {			// One stream of records handled by the sorter
+	cnode n;			// List node; presumably meant for sort_context.bucket_list (nothing links buckets there yet)
+	uns flags;			// SBF_xxx, see enum sort_bucket_flags
+	struct fastbuf *fb;		// Attached stream, NULL if there is none
+	byte *name;			// Unused so far; presumably a file name for buckets without fb -- confirm
+	u64 size;			// Size in bytes (recorded by sorter_close_write())
+	uns runs;			// Number of runs, 0 if unknown
+	uns hash_bits;			// Remaining bits of the hash function
+	byte *ident;			// Identifier used in debug messages
+};
+
+enum sort_bucket_flags {
+	SBF_FINAL = 1,			// This bucket corresponds to the final output file
+	SBF_SOURCE = 2,			// Contains the source file
+};
+
+struct sort_context {			// State of a single sorter_run() invocation
+	struct fastbuf *in_fb;		// Input stream; sorter_run() attaches it to the source bucket
+	struct fastbuf *out_fb;		// Must be NULL on entry; sorter_run() stores the sorted stream here
+	uns hash_bits;
+
+	struct mempool *pool;		// Created by sorter_run(); all buckets are allocated from it
+	clist bucket_list;		// Initialized by sorter_run(), otherwise unused so far
+	byte *big_buf, *big_buf_half;
+	uns big_buf_size, big_buf_half_size;
+
+	struct fastbuf *(*custom_presort)(void);	// Must be NULL for now (sorter_run() asserts it)
+	// Take as much as possible from the source bucket, sort it in memory and dump to destination bucket.
+	// Return 1 if there is more data available in the source bucket.
+	int (*internal_sort)(struct sort_context *ctx, struct sort_bucket *in, struct sort_bucket *out);
+	// Two-way split/merge: merge up to 2 source buckets to up to 2 destination buckets.
+	// Bucket arrays are NULL-terminated.
+	void (*twoway_merge)(struct sort_context *ctx, struct sort_bucket **ins, struct sort_bucket **outs);
+};
+
+void sorter_run(struct sort_context *ctx);
+
+struct sort_bucket *sorter_new_bucket(struct sort_context *ctx);
+struct fastbuf *sorter_open_read(struct sort_bucket *b);
+struct fastbuf *sorter_open_write(struct sort_bucket *b);
+void sorter_close_read(struct sort_bucket *b);
+void sorter_close_write(struct sort_bucket *b);
+
 #endif
diff --git a/lib/sorter/config.c b/lib/sorter/config.c
index 15872076..0026316b 100644
--- a/lib/sorter/config.c
+++ b/lib/sorter/config.c
@@ -1,5 +1,5 @@
 /*
- *	UCW Library -- Universal Sorter -- Configuration
+ *	UCW Library -- Universal Sorter: Configuration
  *
  *	(c) 2007 Martin Mares
  *
diff --git a/lib/sorter/govern.c b/lib/sorter/govern.c
new file mode 100644
index 00000000..aece061c
--- /dev/null
+++ b/lib/sorter/govern.c
@@ -0,0 +1,90 @@
+/*
+ *	UCW Library -- Universal Sorter: Governing Routines
+ *
+ *	(c) 2007 Martin Mares
+ *
+ *	This software may be freely distributed and used according to the terms
+ *	of the GNU Lesser General Public License.
+ */
+
+#include "lib/lib.h"
+#include "lib/fastbuf.h"
+#include "lib/mempool.h"
+#include "lib/sorter/common.h"
+
+struct sort_bucket *	/* Allocate a zero-filled bucket from the context's memory pool */
+sorter_new_bucket(struct sort_context *ctx)
+{
+	return mp_alloc_zero(ctx->pool, sizeof(struct sort_bucket));
+}
+
+struct fastbuf *	/* Return the stream of a bucket which already has one; the stream is not repositioned here */
+sorter_open_read(struct sort_bucket *b)
+{
+	/* FIXME: These functions should handle buckets with no fb and only name. */
+	ASSERT(b->fb);
+	return b->fb;
+}
+
+struct fastbuf *	/* Return the bucket's stream, creating a temporary file on first use */
+sorter_open_write(struct sort_bucket *b)
+{
+	if (!b->fb)
+		b->fb = bopen_tmp(sorter_stream_bufsize);
+	return b->fb;
+}
+
+void	/* Close the stream of a bucket which has been read; a NULL bucket is silently ignored */
+sorter_close_read(struct sort_bucket *b)
+{
+	if (!b)
+		return;
+	ASSERT(b->fb);
+	bclose(b->fb);
+	b->fb = NULL;
+}
+
+void	/* Finish writing: record the size and rewind; buckets which were never opened are left alone */
+sorter_close_write(struct sort_bucket *b)
+{
+	if (b->fb)
+		{
+			b->size = btell(b->fb);	/* Position after the last write, stored as the bucket size */
+			brewind(b->fb);		/* So that the next pass can read from the start */
+		}
+	/* FIXME: Remove empty buckets from the list automatically? */
+}
+
+void	/* Sort ctx->in_fb by repeated two-way merge passes and leave the sorted stream in ctx->out_fb */
+sorter_run(struct sort_context *ctx)
+{
+	ctx->pool = mp_new(4096);	/* NOTE(review): nothing visible in this patch frees the pool again */
+	ASSERT(!ctx->custom_presort);
+	ASSERT(!ctx->out_fb);
+	clist_init(&ctx->bucket_list);
+
+	/* FIXME: There should be a way to detect the size of the input file */
+
+	/* Trivial 2-way merge with no presorting (just a testing hack) */
+	struct sort_bucket *bin = sorter_new_bucket(ctx);
+	bin->flags = SBF_SOURCE;
+	bin->fb = ctx->in_fb;
+	bin->ident = "src";
+	struct sort_bucket *ins[3], *outs[3];	/* NULL-terminated arrays as required by twoway_merge */
+	ins[0] = bin;
+	ins[1] = NULL;
+
+	do {	/* Each pass merges the runs in ins[] and splits the result between two fresh buckets */
+		outs[0] = sorter_new_bucket(ctx);
+		outs[1] = sorter_new_bucket(ctx);
+		outs[2] = NULL;
+		log(L_DEBUG, "Pass...");
+		ctx->twoway_merge(ctx, ins, outs);
+		log(L_DEBUG, "Done (%u+%u runs)", outs[0]->runs, outs[1]->runs);	/* runs is unsigned, hence %u */
+		sorter_close_write(outs[0]);
+		sorter_close_write(outs[1]);
+		memcpy(ins, outs, 3*sizeof(struct sort_bucket *));	/* Outputs of this pass are inputs of the next one */
+	} while (ins[1]->fb);	/* Stop as soon as the second output was never opened, i.e., at most one run was produced */
+
+	ctx->out_fb = sorter_open_write(ins[0]);	/* Normally returns the already rewound stream; for empty input no pass has opened it, so an empty temporary file is created instead of tripping the ASSERT in sorter_open_read() */
+}
diff --git a/lib/sorter/s-internal.h b/lib/sorter/s-internal.h
new file mode 100644
index 00000000..911b74e2
--- /dev/null
+++ b/lib/sorter/s-internal.h
@@ -0,0 +1,14 @@
+/*
+ *	UCW Library -- Universal Sorter: Internal Sorting Module
+ *
+ *	(c) 2007 Martin Mares
+ *
+ *	This software may be freely distributed and used according to the terms
+ *	of the GNU Lesser General Public License.
+ */
+
+static int P(internal)(struct sort_context *ctx, struct sort_bucket *in, struct sort_bucket *out)	/* Placeholder for the in-memory sorter: touches nothing and reports "no more data" */
+{
+	/* FIXME :) */
+	return 0;
+}
diff --git a/lib/sorter/s-twoway.h b/lib/sorter/s-twoway.h
new file mode 100644
index 00000000..6bbdf28d
--- /dev/null
+++ b/lib/sorter/s-twoway.h
@@ -0,0 +1,109 @@
+/*
+ *	UCW Library -- Universal Sorter: Two-Way Merge Module
+ *
+ *	(c) 2007 Martin Mares
+ *
+ *	This software may be freely distributed and used according to the terms
+ *	of the GNU Lesser General Public License.
+ */
+
+/* FIXME: There is plenty of room for further optimization */
+/* FIXME: Swap outputs if there already are some runs? 
*/
+
+static void P(twoway_merge)(struct sort_context *ctx, struct sort_bucket **ins, struct sort_bucket **outs)	/* Merge the runs of ins[0] (and ins[1], if given) pairwise and write the merged runs alternately to outs[0] and outs[1]; ctx is not used */
+{
+	struct fastbuf *fin1, *fin2, *fout1, *fout2, *ftmp;	/* fout1 = output of the current run, fout2 = the other output, ftmp = scratch for SWAP */
+	P(key) kbuf1, kbuf2, kbuf3, kbuf4;
+	P(key) *kin1 = &kbuf1, *kprev1 = &kbuf2, *kin2 = &kbuf3, *kprev2 = &kbuf4;	/* kinN = key read from input N and not yet written, kprevN = the key read before it */
+	P(key) *kout = NULL, *ktmp;	/* kout = last key written (NULL before the first one) */
+	int next1, next2, run1, run2;	/* nextN = result of the last read_key on input N (zero ends the loop for that input), runN = the pending key of input N still belongs to the run being merged */
+	int comp;
+	uns run_count = 0;	/* Number of output runs started */
+
+	fin1 = sorter_open_read(ins[0]);
+	next1 = P(read_key)(fin1, kin1);
+	if (ins[1])
+		{
+			fin2 = sorter_open_read(ins[1]);
+			next2 = P(read_key)(fin2, kin2);
+		}
+	else	/* Single input: behave as if the second one were empty */
+		{
+			fin2 = NULL;
+			next2 = 0;
+		}
+	fout1 = fout2 = NULL;	/* Outputs are opened lazily when their first run starts */
+
+	run1 = next1, run2 = next2;
+	while (next1 || next2)
+		{
+			if (!run1)	/* Choose the input to take from: an input whose run has ended must wait for the other one */
+				comp = 1;
+			else if (!run2)
+				comp = -1;
+			else
+				comp = P(compare)(kin1, kin2);
+			ktmp = (comp <= 0) ? kin1 : kin2;	/* The key which is going to be written */
+			if (!kout || !(P(compare)(kout, ktmp) LESS 0))	/* It does not continue the previous output key => a new output run starts */
+				{
+					SWAP(fout1, fout2, ftmp);	/* Alternate the outputs between consecutive runs */
+					if (unlikely(!fout1))
+						{
+							if (!fout2)	/* Very first run */
+								fout1 = sorter_open_write(outs[0]);
+							else if (outs[1])	/* Second run: open the second output */
+								fout1 = sorter_open_write(outs[1]);
+							else	/* Only one output bucket given: keep writing to it */
+								fout1 = fout2;
+						}
+					run_count++;
+				}
+			if (comp LESS 0)	/* Take from input 1 (LESS is <= unless SORT_MERGE is defined, so ties go here as well) */
+				{
+					P(copy_data)(kin1, fin1, fout1);
+					SWAP(kin1, kprev1, ktmp);	/* Keep the written key as kprev1 and reuse the other buffer for the next read */
+					next1 = P(read_key)(fin1, kin1);
+					run1 = next1 && (P(compare)(kprev1, kin1) LESS 0);	/* Does the new key continue the run? */
+					kout = kprev1;
+				}
+#ifdef SORT_MERGE
+			else if (comp == 0)	/* Equal keys: P(copy_merged) is not defined in this patch, presumably supplied by the includer; both inputs advance */
+				{
+					P(key) *mkeys[] = { kin1, kin2 };
+					struct fastbuf *mfb[] = { fin1, fin2 };
+					P(copy_merged)(mkeys, mfb, 2, fout1);
+					SWAP(kin1, kprev1, ktmp);
+					next1 = P(read_key)(fin1, kin1);
+					run1 = next1 && (P(compare)(kprev1, kin1) LESS 0);
+					SWAP(kin2, kprev2, ktmp);
+					next2 = P(read_key)(fin2, kin2);
+					run2 = next2 && (P(compare)(kprev2, kin2) LESS 0);
+					kout = kprev2;
+				}
+#endif
+#ifdef SORT_ASSERT_UNIQUE
+			else if (unlikely(comp == 0))
+				ASSERT(0);
+#endif
+			else	/* Take from input 2 */
+				{
+					P(copy_data)(kin2, fin2, fout1);
+					SWAP(kin2, kprev2, ktmp);
+					next2 = P(read_key)(fin2, kin2);
+					run2 = next2 && (P(compare)(kprev2, kin2) LESS 0);
+					kout = kprev2;
+				}
+			if (!run1 && !run2)	/* Both input runs are exhausted: go on with the next pair of runs */
+				{
+					run1 = next1;
+					run2 = next2;
+				}
+		}
+
+	sorter_close_read(ins[0]);
+	sorter_close_read(ins[1]);	/* ins[1] may be NULL, sorter_close_read() tolerates that */
+	if (fout2 && fout2 != fout1)	/* Two distinct outputs were used: even-numbered runs are credited to outs[1] */
+		outs[1]->runs += run_count / 2;
+	if (fout1)	/* NOTE(review): with a single output bucket (fout1 == fout2) only half of the runs are credited -- confirm whether that is intended */
+		outs[0]->runs += (run_count+1) / 2;
+}
diff --git a/lib/sorter/sort-test.c b/lib/sorter/sort-test.c
new file mode 100644
index 00000000..c17333c3
--- /dev/null
+++ b/lib/sorter/sort-test.c
@@ -0,0 +1,56 @@
+/* A test of sorting routines */
+
+#include "lib/lib.h"
+#include "lib/getopt.h"
+#include "lib/fastbuf.h"
+
+#include	/* NOTE(review): the header names of these four includes are missing from this copy of the patch -- restore them from the original commit */
+#include
+#include
+#include
+
+struct key {	/* Record used by the test: a single unsigned number, which is also the sort key (see SORT_INT below) */
+	uns x;
+};
+
+#define SORT_KEY_REGULAR struct key
+#define SORT_PREFIX(x) s_##x
+#define SORT_INPUT_FB
+#define SORT_OUTPUT_FB
+#define SORT_INT(k) (k).x
+
+#include "lib/sorter/sorter.h"
+
+int
+main(int argc, char **argv)
+{
+	log_init(NULL);
+	if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 ||
+	    optind != argc - 2)	/* Exactly two non-option arguments are required */
+		{
+			fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
+			exit(1);
+		}
+
+	log(L_INFO, "Generating");
+	struct fastbuf *f = bopen(argv[optind], O_RDWR | O_CREAT | O_TRUNC, 65536);	/* The first argument names the file which is created or truncated here */
+#define N 259309
+#define K 199483
+	for (uns i=0; i PREFIX_SORT(, , ), where:
- *			= input file name/fastbuf
- *			= output file name/fastbuf
+ *  PREFIX_SORT(, [,]), where:
+ *			= input file name/fastbuf or NULL
+ *			= output file name/fastbuf or NULL
  *			= maximum integer value for the SORT_INT mode
  *			= output fastbuf (in SORT_OUTPUT_FB mode)
- *  (any parameter can be missing if it is not applicable).
- *
- *  void PREFIX_merge_data(struct fastbuf *src1, *src2, *dest, SORT_KEY *k1, *k2)
- *			[used only in case SORT_UNIFY is defined]
- *			write just fetched key k to dest and merge data from
- *			two records with the same key (k1 and k2 are key occurences
- *			in the corresponding streams).
- *  SORT_KEY * PREFIX_merge_items(SORT_KEY *a, SORT_KEY *b)
- *			[used only with SORT_PRESORT && SORT_UNIFY]
- *			merge two items with the same key, returns pointer
- *			to at most one of the items, the rest will be removed
- *			from the list of items, but not deallocated, so
- *			the remaining item can freely reference data of the
- *			other one.
  *
  *  After including this file, all parameter macros are automatically
  *  undef'd.
  */
+#include "lib/sorter/common.h"
+#include "lib/fastbuf.h"
+
+#include	/* NOTE(review): the header name is missing from this copy of the patch; O_RDONLY below suggests <fcntl.h> -- confirm against the original commit */
+
 #define P(x) SORT_PREFIX(x)
+#ifdef SORT_KEY_REGULAR	/* Fixed-size key stored verbatim: the key I/O routines are generated here */
+typedef SORT_KEY_REGULAR P(key);
+static inline int P(read_key) (struct fastbuf *f, P(key) *k)	/* Read sizeof(key) raw bytes into *k and pass on the result of breadb(); callers treat zero as end of input */
+{
+	return breadb(f, k, sizeof(P(key)));
+}
+static inline void P(write_key) (struct fastbuf *f, P(key) *k)	/* Write the raw bytes of *k */
+{
+	bwrite(f, k, sizeof(P(key)));
+}
+#elif defined(SORT_KEY)	/* Custom key type: no read_key/write_key are generated in this branch, presumably the includer provides them */
+typedef SORT_KEY P(key);
+#else
+#error Missing definition of sorting key.
+#endif
+
+#ifdef SORT_INT	/* Keys ordered by the integer SORT_INT(key): generate the comparison function */
+static inline int P(compare) (P(key) *x, P(key) *y)	/* Returns -1, 0 or 1 according to the order of the integer values */
+{
+	if (SORT_INT(*x) < SORT_INT(*y))
+		return -1;
+	if (SORT_INT(*x) > SORT_INT(*y))
+		return 1;
+	return 0;
+}
+
+#ifndef SORT_HASH_BITS	/* Default hash, generated only when the includer did not define SORT_HASH_BITS: the integer value itself */
+static inline int P(hash) (P(key) *x)
+{
+	return SORT_INT((*x));
+}
+#endif
+#endif
+
+#ifdef SORT_MERGE	/* LESS is used as "compare(a,b) LESS 0": strict with SORT_MERGE, so that equal keys are told apart and can be merged */
+#define LESS <
+#else
+#define LESS <=
+#endif
+#define SWAP(x,y,z) do { z=x; x=y; y=z; } while(0)
+
+#if defined(SORT_UNIQUE) && defined(DEBUG_ASSERTS)
+#define SORT_ASSERT_UNIQUE
+#endif
+
+static inline void P(copy_data)(P(key) *key, struct fastbuf *in, struct fastbuf *out)	/* Write the key to out, followed by SORT_DATA_SIZE(*key) bytes copied from in when SORT_DATA_SIZE is defined */
+{
+	bwrite(out, key, sizeof(P(key)));
+#ifdef SORT_DATA_SIZE
+	bbcopy(in, out, SORT_DATA_SIZE(*key));
+#else
+	(void) in;	/* Records consist of the key only; silence the unused-parameter warning */
+#endif
+}
+
+#include "lib/sorter/s-internal.h"
+#include "lib/sorter/s-twoway.h"
+
+static struct fastbuf *P(sort)(	/* Entry point: set up a sort_context according to the SORT_xxx macros, run the sorter and return the output fastbuf (NULL in SORT_OUTPUT_FILE mode) */
+#ifdef SORT_INPUT_FILE
+	byte *in,	/* Name of the input file */
+#else
+	struct fastbuf *in,	/* Input fastbuf; must be NULL in SORT_INPUT_PRESORT mode */
+#endif
+#ifdef SORT_OUTPUT_FILE
+	byte *out	/* Name of the output file */
+#else
+	struct fastbuf *out	/* Must be NULL in SORT_OUTPUT_FB mode; the target fastbuf in SORT_OUTPUT_THIS_FB mode */
+#endif
+#ifdef SORT_INT
+	, uns int_range	/* Maximum integer value, used to derive hash_bits unless SORT_HASH_BITS is defined */
+#endif
+	)
+{
+	struct sort_context ctx;
+	bzero(&ctx, sizeof(ctx));
+
+#ifdef SORT_INPUT_FILE
+	ctx.in_fb = bopen(in, O_RDONLY, sorter_stream_bufsize);
+#elif defined(SORT_INPUT_FB)
+	ctx.in_fb = in;
+#elif defined(SORT_INPUT_PRESORT)
+	ASSERT(!in);
+	ctx.custom_presort = P(presorter);	/* NOTE(review): sorter_run() currently asserts that custom_presort is NULL, so this mode cannot work yet */
+#else
+#error No input given.
+#endif
+
+#ifdef SORT_OUTPUT_FB
+	ASSERT(!out);
+#elif defined(SORT_OUTPUT_THIS_FB)
+	ctx.out_fb = out;	/* NOTE(review): sorter_run() asserts that out_fb is NULL on entry, so a non-NULL out trips that assertion for now */
+#elif defined(SORT_OUTPUT_FILE)
+	/* Just assume fastbuf output and rename the fastbuf later */
+#else
+#error No output given.
+#endif
+
+#ifdef SORT_HASH_BITS
+	ctx.hash_bits = SORT_HASH_BITS;
+#elif defined(SORT_INT)
+	ctx.hash_bits = 0;
+	while (ctx.hash_bits < 32 && (int_range >> ctx.hash_bits))	/* Number of significant bits of int_range, capped at 32 */
+		ctx.hash_bits++;
+#endif
+
+	ctx.internal_sort = P(internal);
+	ctx.twoway_merge = P(twoway_merge);
+
+	sorter_run(&ctx);
+
+#ifdef SORT_OUTPUT_FILE
+	if (rename(ctx.out_fb->name, out) < 0)	/* Move the file behind the resulting fastbuf to the requested name */
+		die("Cannot rename %s to %s: %m", ctx.out_fb->name, out);
+	bconfig(ctx.out_fb, BCONFIG_IS_TEMP_FILE, 0);	/* Presumably keeps bclose() from deleting the renamed file -- confirm against the fastbuf documentation */
+	bclose(ctx.out_fb);
+	ctx.out_fb = NULL;
+#endif
+	return ctx.out_fb;
+}
+
+#undef SORT_KEY
+#undef SORT_KEY_REGULAR
+#undef SORT_KEY_SIZE
+#undef SORT_DATA_SIZE
+#undef SORT_INT
+#undef SORT_HASH_BITS
+#undef SORT_MERGE
+#undef SORT_INPUT_FILE
+#undef SORT_INPUT_FB
+#undef SORT_INPUT_PRESORT
+#undef SORT_OUTPUT_FILE
+#undef SORT_OUTPUT_FB
+#undef SORT_OUTPUT_THIS_FB
+#undef SORT_UNIQUE
+#undef SORT_ASSERT_UNIQUE
+#undef SWAP
+#undef LESS
 #undef P
 /* FIXME: Check that we undef everything we should. */
-- 
2.39.2