DIRS+=lib/sorter
-LIBUCW_MODS+=sorter/config
+LIBUCW_MODS+=sorter/config sorter/govern
+
+$(o)/lib/sorter/sort-test: $(o)/lib/sorter/sort-test.o $(LIBUCW)
/*
- * UCW Library -- Universal Sorter
+ * UCW Library -- Universal Sorter: Common Declarations
*
* (c) 2007 Martin Mares <mj@ucw.cz>
*
#ifndef _UCW_SORTER_COMMON_H
#define _UCW_SORTER_COMMON_H
+#include "lib/clists.h"
+
/* Configuration, some of the variables are used by the old sorter, too. */
extern uns sorter_trace, sorter_presort_bufsize, sorter_stream_bufsize;
+struct sort_bucket {	// One stream of records handled by the sorter (source, intermediate or final)
+ cnode n;	// List node; NOTE(review): presumably links the bucket into sort_context.bucket_list -- the visible code never inserts it
+ uns flags;	// SBF_xxx values, see enum sort_bucket_flags
+ struct fastbuf *fb;	// Stream with the bucket's data; NULL while no stream is attached
+ byte *name;	// NOTE(review): presumably the name of a backing file -- not used by the visible code
+ u64 size; // Size in bytes (recorded by sorter_close_write())
+ uns runs; // Number of runs, 0 if unknown
+ uns hash_bits; // Remaining bits of the hash function
+ byte *ident; // Identifier used in debug messages
+};
+
+enum sort_bucket_flags {	// Values stored in sort_bucket.flags
+ SBF_FINAL = 1, // This bucket corresponds to the final output file
+ SBF_SOURCE = 2, // Contains the source file
+};
+
+struct sort_context {	// State of a single sorter run, shared by the governing code and the generated routines
+ struct fastbuf *in_fb;	// Input stream; sorter_run() turns it into the source bucket
+ struct fastbuf *out_fb;	// Stream with the sorted result; filled in by sorter_run()
+ uns hash_bits;	// Number of hash bits, set by the generated PREFIX_sort()
+
+ struct mempool *pool;	// Created by sorter_run(); buckets are allocated from it
+ clist bucket_list;	// Initialized by sorter_run(); NOTE(review): presumably the list of live buckets
+ byte *big_buf, *big_buf_half;	// NOTE(review): not used by the visible code; presumably buffers for in-memory sorting
+ uns big_buf_size, big_buf_half_size;	// NOTE(review): presumably the sizes of the buffers above
+
+ struct fastbuf *(*custom_presort)(void);	// Custom presorter hook; sorter_run() currently insists on NULL
+ // Take as much as possible from the source bucket, sort it in memory and dump to destination bucket.
+ // Return 1 if there is more data available in the source bucket.
+ int (*internal_sort)(struct sort_context *ctx, struct sort_bucket *in, struct sort_bucket *out);
+ // Two-way split/merge: merge up to 2 source buckets to up to 2 destination buckets.
+ // Bucket arrays are NULL-terminated.
+ void (*twoway_merge)(struct sort_context *ctx, struct sort_bucket **ins, struct sort_bucket **outs);
+};
+
+void sorter_run(struct sort_context *ctx);	// Sort ctx->in_fb; the result is left in ctx->out_fb
+
+struct sort_bucket *sorter_new_bucket(struct sort_context *ctx);	// Allocate a zeroed bucket from ctx->pool
+struct fastbuf *sorter_open_read(struct sort_bucket *b);	// Return the bucket's stream; the bucket must already have one
+struct fastbuf *sorter_open_write(struct sort_bucket *b);	// Return the bucket's stream, creating a temporary one if there is none
+void sorter_close_read(struct sort_bucket *b);	// Close the bucket's stream and detach it; a NULL bucket is ignored
+void sorter_close_write(struct sort_bucket *b);	// Record the amount of data written and rewind the stream
+
#endif
/*
- * UCW Library -- Universal Sorter -- Configuration
+ * UCW Library -- Universal Sorter: Configuration
*
* (c) 2007 Martin Mares <mj@ucw.cz>
*
--- /dev/null
+/*
+ * UCW Library -- Universal Sorter: Governing Routines
+ *
+ * (c) 2007 Martin Mares <mj@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#include "lib/lib.h"
+#include "lib/fastbuf.h"
+#include "lib/mempool.h"
+#include "lib/sorter/common.h"
+
+/*
+ * Allocate a new bucket from the memory pool of @ctx. All fields of the
+ * returned bucket are zero.
+ */
+struct sort_bucket *
+sorter_new_bucket(struct sort_context *ctx)
+{
+  struct sort_bucket *bucket = mp_alloc_zero(ctx->pool, sizeof(struct sort_bucket));
+
+  return bucket;
+}
+
+/*
+ * Return the stream of bucket @b for reading. Nothing is opened here:
+ * the bucket must already have a stream attached.
+ */
+struct fastbuf *
+sorter_open_read(struct sort_bucket *b)
+{
+  /* FIXME: These functions should handle buckets with no fb and only name. */
+  struct fastbuf *stream = b->fb;
+
+  ASSERT(b->fb);
+  return stream;
+}
+
+/*
+ * Return the stream of bucket @b for writing. A bucket without a stream
+ * gets a fresh temporary one with a buffer of sorter_stream_bufsize bytes.
+ */
+struct fastbuf *
+sorter_open_write(struct sort_bucket *b)
+{
+  if (b->fb)
+    return b->fb;
+
+  b->fb = bopen_tmp(sorter_stream_bufsize);
+  return b->fb;
+}
+
+/*
+ * Finish reading of bucket @b: close its stream and detach it from the
+ * bucket. A NULL bucket is silently ignored, a bucket without a stream
+ * is a bug.
+ */
+void
+sorter_close_read(struct sort_bucket *b)
+{
+  if (b)
+    {
+      ASSERT(b->fb);
+      bclose(b->fb);
+      b->fb = NULL;
+    }
+}
+
+/*
+ * Finish writing of bucket @b: remember how much was written and rewind
+ * the stream. A bucket which has never been opened is left untouched.
+ */
+void
+sorter_close_write(struct sort_bucket *b)
+{
+  struct fastbuf *stream = b->fb;
+
+  /* FIXME: Remove empty buckets from the list automatically? */
+  if (!stream)
+    return;
+
+  b->size = btell(stream);
+  brewind(stream);
+}
+
+/*
+ * Run the sorter described by @ctx: sort the records of ctx->in_fb and
+ * leave a rewound stream with the result in ctx->out_fb.
+ *
+ * Custom presorters and caller-supplied output streams are not implemented
+ * yet, so both are rejected by assertions. All bookkeeping (the memory pool
+ * and the buckets allocated from it) is released before returning; only
+ * ctx->out_fb survives.
+ */
+void
+sorter_run(struct sort_context *ctx)
+{
+  ctx->pool = mp_new(4096);
+  ASSERT(!ctx->custom_presort);
+  ASSERT(!ctx->out_fb);
+  clist_init(&ctx->bucket_list);
+
+  /* FIXME: There should be a way to detect the size of the input file */
+
+  /* Trivial 2-way merge with no presorting (just a testing hack) */
+  struct sort_bucket *bin = sorter_new_bucket(ctx);
+  bin->flags = SBF_SOURCE;
+  bin->fb = ctx->in_fb;
+  bin->ident = "src";
+  struct sort_bucket *ins[3], *outs[3];
+  ins[0] = bin;
+  ins[1] = NULL;
+
+  /*
+   * Each pass distributes merged runs to two fresh buckets. We are done
+   * as soon as a pass did not need the second output, i.e., when everything
+   * ended up in outs[0].
+   */
+  do
+    {
+      outs[0] = sorter_new_bucket(ctx);
+      outs[1] = sorter_new_bucket(ctx);
+      outs[2] = NULL;
+      log(L_DEBUG, "Pass...");
+      ctx->twoway_merge(ctx, ins, outs);
+      /* The run counters are unsigned, hence %u. */
+      log(L_DEBUG, "Done (%u+%u runs)", outs[0]->runs, outs[1]->runs);
+      sorter_close_write(outs[0]);
+      sorter_close_write(outs[1]);
+      memcpy(ins, outs, 3*sizeof(struct sort_bucket *));
+    }
+  while (ins[1]->fb);
+
+  /*
+   * An empty input produces no output stream at all, which would trip the
+   * assertion in sorter_open_read(). Hand out an empty temporary stream
+   * in that case.
+   */
+  if (!ins[0]->fb)
+    {
+      sorter_open_write(ins[0]);
+      sorter_close_write(ins[0]);
+    }
+  ctx->out_fb = sorter_open_read(ins[0]);
+
+  /* The buckets are no longer needed; the output stream does not live in the pool. */
+  mp_delete(ctx->pool);
+  ctx->pool = NULL;
+}
--- /dev/null
+/*
+ * UCW Library -- Universal Sorter: Internal Sorting Module
+ *
+ * (c) 2007 Martin Mares <mj@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+/*
+ * Internal (in-memory) sorting pass. Not implemented yet: it consumes nothing
+ * and always returns 0.
+ */
+static int P(internal)(struct sort_context *ctx, struct sort_bucket *in, struct sort_bucket *out)
+{
+  /* FIXME :) */
+  (void) ctx;
+  (void) in;
+  (void) out;
+  return 0;
+}
--- /dev/null
+/*
+ * UCW Library -- Universal Sorter: Two-Way Merge Module
+ *
+ * (c) 2007 Martin Mares <mj@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+/* FIXME: There is plenty of room for further optimization */
+/* FIXME: Swap outputs if there already are some runs? */
+/*
+ * Two-way split/merge pass.
+ *
+ * Reads records from ins[0] and, if it is given, from ins[1]. The current
+ * runs of the two inputs are merged into a single run, and the merged runs
+ * are written alternately to outs[0] and outs[1] (everything goes to outs[0]
+ * when outs[1] is NULL). Output streams are opened only when they are first
+ * needed, so an unused output bucket keeps fb == NULL.
+ *
+ * At the end both inputs are closed and the numbers of runs written are
+ * added to the `runs' fields of the output buckets.
+ */
+static void P(twoway_merge)(struct sort_context *ctx, struct sort_bucket **ins, struct sort_bucket **outs)
+{
+ struct fastbuf *fin1, *fin2, *fout1, *fout2, *ftmp;
+ P(key) kbuf1, kbuf2, kbuf3, kbuf4;
+ P(key) *kin1 = &kbuf1, *kprev1 = &kbuf2, *kin2 = &kbuf3, *kprev2 = &kbuf4;	/* kinN: key just read from input N, kprevN: previous key of input N */
+ P(key) *kout = NULL, *ktmp;	/* kout: the last key written, NULL before the first record */
+ int next1, next2, run1, run2;	/* nextN: input N has a key waiting in kinN; runN: that key continues the current run of input N */
+ int comp;
+ uns run_count = 0;	/* Number of output runs started so far */
+
+ fin1 = sorter_open_read(ins[0]);
+ next1 = P(read_key)(fin1, kin1);
+ if (ins[1])
+ {
+ fin2 = sorter_open_read(ins[1]);
+ next2 = P(read_key)(fin2, kin2);
+ }
+ else
+ {
+ fin2 = NULL;
+ next2 = 0;
+ }
+ fout1 = fout2 = NULL;	/* fout1 is the output being written, fout2 the other one */
+
+ run1 = next1, run2 = next2;
+ while (next1 || next2)
+ {
+ if (!run1)	/* Run of input 1 is over: take from input 2 */
+ comp = 1;
+ else if (!run2)	/* Run of input 2 is over: take from input 1 */
+ comp = -1;
+ else
+ comp = P(compare)(kin1, kin2);
+ ktmp = (comp <= 0) ? kin1 : kin2;	/* The key which is going to be written */
+ if (!kout || !(P(compare)(kout, ktmp) LESS 0))	/* It cannot follow the last key written: start a new output run */
+ {
+ SWAP(fout1, fout2, ftmp);	/* Alternate between the two outputs */
+ if (unlikely(!fout1))	/* This output has not been opened yet */
+ {
+ if (!fout2)
+ fout1 = sorter_open_write(outs[0]);
+ else if (outs[1])
+ fout1 = sorter_open_write(outs[1]);
+ else
+ fout1 = fout2;	/* Only one output bucket given: keep writing to it */
+ }
+ run_count++;
+ }
+ if (comp LESS 0)
+ {
+ P(copy_data)(kin1, fin1, fout1);
+ SWAP(kin1, kprev1, ktmp);	/* The key just written becomes kprev1 */
+ next1 = P(read_key)(fin1, kin1);
+ run1 = next1 && (P(compare)(kprev1, kin1) LESS 0);
+ kout = kprev1;
+ }
+#ifdef SORT_MERGE
+ else if (comp == 0)	/* Equal keys in both inputs: write a single merged record and advance both */
+ {
+ P(key) *mkeys[] = { kin1, kin2 };
+ struct fastbuf *mfb[] = { fin1, fin2 };
+ P(copy_merged)(mkeys, mfb, 2, fout1);
+ SWAP(kin1, kprev1, ktmp);
+ next1 = P(read_key)(fin1, kin1);
+ run1 = next1 && (P(compare)(kprev1, kin1) LESS 0);
+ SWAP(kin2, kprev2, ktmp);
+ next2 = P(read_key)(fin2, kin2);
+ run2 = next2 && (P(compare)(kprev2, kin2) LESS 0);
+ kout = kprev2;
+ }
+#endif
+#ifdef SORT_ASSERT_UNIQUE
+ else if (unlikely(comp == 0))
+ ASSERT(0);
+#endif
+ else
+ {
+ P(copy_data)(kin2, fin2, fout1);
+ SWAP(kin2, kprev2, ktmp);	/* The key just written becomes kprev2 */
+ next2 = P(read_key)(fin2, kin2);
+ run2 = next2 && (P(compare)(kprev2, kin2) LESS 0);
+ kout = kprev2;
+ }
+ if (!run1 && !run2)	/* Both input runs are exhausted: go on with the next pair of runs */
+ {
+ run1 = next1;
+ run2 = next2;
+ }
+ }
+
+ sorter_close_read(ins[0]);
+ sorter_close_read(ins[1]);	/* ins[1] may be NULL, which sorter_close_read() ignores */
+ if (fout2 && fout2 != fout1)
+ outs[1]->runs += run_count / 2;
+ if (fout1)
+ outs[0]->runs += (run_count+1) / 2;	/* NOTE(review): with a single output (fout1 == fout2) only half of the runs get counted -- confirm this is intended */
+}
--- /dev/null
+/* A test of sorting routines */
+
+#include "lib/lib.h"
+#include "lib/getopt.h"
+#include "lib/fastbuf.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+
+struct key {	/* Record sorted by the test: just a single integer */
+ uns x;
+};
+
+#define SORT_KEY_REGULAR struct key	/* Key type; the sorter generates read_key/write_key for it */
+#define SORT_PREFIX(x) s_##x	/* Generated functions are called s_xxx, e.g. s_sort() */
+#define SORT_INPUT_FB	/* Input is passed as a fastbuf */
+#define SORT_OUTPUT_FB	/* Output is returned as a fastbuf */
+#define SORT_INT(k) (k).x	/* Keys are compared by this integer */
+
+#include "lib/sorter/sorter.h"
+
+/*
+ * Test driver: write a permutation of 0..N-1 to the file given as the first
+ * non-option argument, sort it by s_sort() and check that the result is
+ * exactly the sequence 0, 1, ..., N-1.
+ */
+int
+main(int argc, char **argv)
+{
+  log_init(NULL);
+  /* NOTE(review): two non-option arguments are required, but only the first one is used below. */
+  if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 ||
+      optind != argc - 2)
+    {
+      fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
+      exit(1);
+    }
+
+  log(L_INFO, "Generating");
+  struct fastbuf *f = bopen(argv[optind], O_RDWR | O_CREAT | O_TRUNC, 65536);
+#define N 259309
+#define K 199483
+  /* i -> (i*K + 17) mod N; the verification below relies on every value 0..N-1 appearing exactly once. */
+  for (uns i=0; i<N; i++)
+    bputl(f, ((u64)i * K + 17) % N);
+  brewind(f);
+
+  log(L_INFO, "Sorting");
+  f = s_sort(f, NULL, N-1);
+
+  log(L_INFO, "Verifying");
+  for (uns i=0; i<N; i++)
+    {
+      uns j = bgetl(f);
+      /* Both values are unsigned, so they are printed with %u. */
+      if (i != j)
+	die("Discrepancy: %u instead of %u", j, i);
+    }
+  bclose(f);
+
+  return 0;
+}
*
* Hashing (optional, but it can speed sorting up):
*
- * SORT_HASH_FN(key) returns a monotone hash of a given key. Monotone hash is a function f
- * such that f(x) < f(y) implies x < y and which is approximately uniformly
- * distributed.
- * SORT_HASH_BITS how many bits do the hashes have.
+ * SORT_HASH_BITS signals that a monotone hashing function returning a given number of
+ * bits is available. Monotone hash is a function f such that f(x) < f(y)
+ * implies x < y and which is approximately uniformly distributed.
+ * uns PREFIX_hash(SORT_KEY *a)
*
* Unification:
*
* SORT_OUTPUT_FB temporary fastbuf stream
* SORT_OUTPUT_THIS_FB a given fastbuf stream which can already contain a header
*
+ * Other switches:
+ *
+ * SORT_UNIQUE all items have distinct keys (checked in debug mode)
+ *
* FIXME: Maybe implement these:
- * ??? SORT_UNIQUE all items have distinct keys (checked in debug mode)
* ??? SORT_DELETE_INPUT a C expression, if true, the input files are
* deleted as soon as possible
* ??? SORT_ALIGNED
*
* The function generated:
*
- * <outfb> PREFIX_SORT(<in>, <out>, <range>), where:
- * <in> = input file name/fastbuf
- * <out> = output file name/fastbuf
+ * <outfb> PREFIX_sort(<in>, <out> [,<range>]), where:
+ * <in> = input file name/fastbuf or NULL
+ * <out> = output file name/fastbuf or NULL
* <range> = maximum integer value for the SORT_INT mode
* <outfb> = output fastbuf (in SORT_OUTPUT_FB mode)
- * (any parameter can be missing if it is not applicable).
- *
- * void PREFIX_merge_data(struct fastbuf *src1, *src2, *dest, SORT_KEY *k1, *k2)
- * [used only in case SORT_UNIFY is defined]
- * write just fetched key k to dest and merge data from
- * two records with the same key (k1 and k2 are key occurences
- * in the corresponding streams).
- * SORT_KEY * PREFIX_merge_items(SORT_KEY *a, SORT_KEY *b)
- * [used only with SORT_PRESORT && SORT_UNIFY]
- * merge two items with the same key, returns pointer
- * to at most one of the items, the rest will be removed
- * from the list of items, but not deallocated, so
- * the remaining item can freely reference data of the
- * other one.
*
* After including this file, all parameter macros are automatically
* undef'd.
*/
+#include "lib/sorter/common.h"
+#include "lib/fastbuf.h"
+
+#include <fcntl.h>
+
#define P(x) SORT_PREFIX(x)
+#ifdef SORT_KEY_REGULAR
+typedef SORT_KEY_REGULAR P(key);
+/* Read one key of the regular (fixed-size) type from @f to @k and return what breadb() returned. */
+static inline int P(read_key) (struct fastbuf *f, P(key) *k)
+{
+  int got = breadb(f, k, sizeof(*k));
+
+  return got;
+}
+/* Write the key @k of the regular (fixed-size) type to @f as raw bytes. */
+static inline void P(write_key) (struct fastbuf *f, P(key) *k)
+{
+  bwrite(f, k, sizeof(*k));
+}
+#elif defined(SORT_KEY)
+typedef SORT_KEY P(key);
+#else
+#error Missing definition of sorting key.
+#endif
+
+#ifdef SORT_INT
+/*
+ * Default comparison for SORT_INT keys: returns -1, 0 or 1 when the integer
+ * of @x is smaller than, equal to or greater than the integer of @y.
+ */
+static inline int P(compare) (P(key) *x, P(key) *y)
+{
+  return (SORT_INT(*x) > SORT_INT(*y)) - (SORT_INT(*x) < SORT_INT(*y));
+}
+
+#ifndef SORT_HASH_BITS
+/*
+ * Default hash for SORT_INT keys: the integer itself. The result is unsigned
+ * to match the documented prototype `uns PREFIX_hash(SORT_KEY *a)'; a signed
+ * result would turn large keys into negative hashes and break monotonicity.
+ */
+static inline uns P(hash) (P(key) *x)
+{
+  return SORT_INT((*x));
+}
+#endif
+#endif
+
+/*
+ * LESS is applied to results of compare() whenever the merge decides whether
+ * a key may follow another one. Equal keys may follow each other only if they
+ * are neither merged (SORT_MERGE) nor forbidden (SORT_UNIQUE). In these two
+ * modes the comparison has to be strict, otherwise equal keys would be
+ * consumed by the ordinary `comp LESS 0' branch and the special handling of
+ * `comp == 0' would never be reached.
+ */
+#if defined(SORT_MERGE) || defined(SORT_UNIQUE)
+#define LESS <
+#else
+#define LESS <=
+#endif
+/* Exchange x and y, using z as a temporary. */
+#define SWAP(x,y,z) do { z=x; x=y; y=z; } while(0)
+
+/* Uniqueness of keys is checked only when DEBUG_ASSERTS is defined. */
+#if defined(SORT_UNIQUE) && defined(DEBUG_ASSERTS)
+#define SORT_ASSERT_UNIQUE
+#endif
+
+/*
+ * Copy a single record to @out: the key is written from memory (@key) and,
+ * if SORT_DATA_SIZE is defined, SORT_DATA_SIZE(*key) bytes of data are copied
+ * from @in after it.
+ */
+static inline void P(copy_data)(P(key) *key, struct fastbuf *in, struct fastbuf *out)
+{
+  bwrite(out, key, sizeof(P(key)));
+#ifndef SORT_DATA_SIZE
+  /* Keys without attached data: the input stream is not needed. */
+  (void) in;
+#else
+  bbcopy(in, out, SORT_DATA_SIZE(*key));
+#endif
+}
+
+#include "lib/sorter/s-internal.h"
+#include "lib/sorter/s-twoway.h"
+/*
+ * The generated entry point: fill in a sort_context according to the SORT_xxx
+ * parameter macros, call sorter_run() and hand over the result.
+ *
+ * in		input file name (SORT_INPUT_FILE) or input fastbuf (SORT_INPUT_FB);
+ *		must be NULL with SORT_INPUT_PRESORT
+ * out		output file name (SORT_OUTPUT_FILE) or output fastbuf
+ *		(SORT_OUTPUT_THIS_FB); must be NULL with SORT_OUTPUT_FB
+ * int_range	present only with SORT_INT: maximum integer value of a key
+ *
+ * Returns ctx.out_fb as set up by sorter_run(). With SORT_OUTPUT_FILE, the
+ * output is renamed to `out' and closed, and NULL is returned.
+ */
+static struct fastbuf *P(sort)(
+#ifdef SORT_INPUT_FILE
+ byte *in,
+#else
+ struct fastbuf *in,
+#endif
+#ifdef SORT_OUTPUT_FILE
+ byte *out
+#else
+ struct fastbuf *out
+#endif
+#ifdef SORT_INT
+ , uns int_range
+#endif
+ )
+{
+ struct sort_context ctx;
+ bzero(&ctx, sizeof(ctx));	/* All fields not set below stay zero/NULL */
+
+#ifdef SORT_INPUT_FILE
+ ctx.in_fb = bopen(in, O_RDONLY, sorter_stream_bufsize);
+#elif defined(SORT_INPUT_FB)
+ ctx.in_fb = in;
+#elif defined(SORT_INPUT_PRESORT)
+ ASSERT(!in);
+ ctx.custom_presort = P(presorter);
+#else
+#error No input given.
+#endif
+
+#ifdef SORT_OUTPUT_FB
+ ASSERT(!out);
+#elif defined(SORT_OUTPUT_THIS_FB)
+ ctx.out_fb = out;
+#elif defined(SORT_OUTPUT_FILE)
+ /* Just assume fastbuf output and rename the fastbuf later */
+#else
+#error No output given.
+#endif
+
+#ifdef SORT_HASH_BITS
+ ctx.hash_bits = SORT_HASH_BITS;
+#elif defined(SORT_INT)
+ ctx.hash_bits = 0;
+ while (ctx.hash_bits < 32 && (int_range >> ctx.hash_bits))	/* Count the bits needed to represent int_range, at most 32 */
+ ctx.hash_bits++;
+#endif
+
+ ctx.internal_sort = P(internal);
+ ctx.twoway_merge = P(twoway_merge);
+
+ sorter_run(&ctx);
+
+#ifdef SORT_OUTPUT_FILE
+ if (rename(ctx.out_fb->name, out) < 0)	/* Move the file behind the output stream to its final name */
+ die("Cannot rename %s to %s: %m", ctx.out_fb->name, out);
+ bconfig(ctx.out_fb, BCONFIG_IS_TEMP_FILE, 0);	/* NOTE(review): presumably keeps bclose() from removing the renamed file -- confirm */
+ bclose(ctx.out_fb);
+ ctx.out_fb = NULL;
+#endif
+ return ctx.out_fb;
+}
+
+/*
+ * Forget all parameter macros and internal helpers, so that the sorter can
+ * be instantiated again with different parameters. SORT_PREFIX belongs here
+ * as well: a second instantiation has to define it anew.
+ */
+#undef SORT_PREFIX
+#undef SORT_KEY
+#undef SORT_KEY_REGULAR
+#undef SORT_KEY_SIZE
+#undef SORT_DATA_SIZE
+#undef SORT_INT
+#undef SORT_HASH_BITS
+#undef SORT_MERGE
+#undef SORT_INPUT_FILE
+#undef SORT_INPUT_FB
+#undef SORT_INPUT_PRESORT
+#undef SORT_OUTPUT_FILE
+#undef SORT_OUTPUT_FB
+#undef SORT_OUTPUT_THIS_FB
+#undef SORT_UNIQUE
+#undef SORT_ASSERT_UNIQUE
+#undef SWAP
+#undef LESS
#undef P
/* FIXME: Check that we undef everything we should. */