From: Martin Mares Date: Sun, 4 Feb 2001 14:44:54 +0000 (+0000) Subject: First version of the sorter. No presorting phase yet. X-Git-Tag: holmes-import~1563 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=23f2f5abfec1098ce98afa88b8308b869d59216c;p=libucw.git First version of the sorter. No presorting phase yet. --- diff --git a/lib/Makefile b/lib/Makefile index 918f34bb..5be7ccef 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,12 +1,12 @@ -# Makefile for the Sherlock Library (c) 1997--2000 Martin Mares +# Makefile for the Sherlock Library (c) 1997--2001 Martin Mares DIRS+=lib -PROGS+=obj/lib/db-test obj/lib/db-rebuild obj/lib/buckettool obj/lib/conf-test +PROGS+=obj/lib/db-test obj/lib/db-rebuild obj/lib/buckettool SHLIB_OBJS=alloc.o alloc_str.o ctmatch.o db.o fastbuf.o fb-file.o fb-mem.o lists.o \ log.o log2.o md5.o md5hex.o mmap.o pagecache.o patimatch.o patmatch.o pool.o \ prime.o random.o realloc.o regex.o temp.o timer.o url.o wildmatch.o \ - wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o + wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o obj/lib/libsh.a: $(addprefix obj/lib/,$(SHLIB_OBJS)) @@ -14,3 +14,4 @@ obj/lib/db-test: obj/lib/db-test.o obj/lib/libsh.a obj/lib/db-rebuild: obj/lib/db-rebuild.o obj/lib/libsh.a obj/lib/buckettool: obj/lib/buckettool.o obj/lib/libsh.a obj/lib/conf-test: obj/lib/conf-test.o obj/lib/libsh.a +obj/lib/sort-test: obj/lib/sort-test.o obj/lib/libsh.a diff --git a/lib/sort-test.c b/lib/sort-test.c new file mode 100644 index 00000000..2f61b152 --- /dev/null +++ b/lib/sort-test.c @@ -0,0 +1,50 @@ +/* Test for sorting routines */ + +#include "lib/lib.h" +#include "lib/conf.h" +#include "lib/fastbuf.h" + +#include +#include + +struct key { + char line[1024]; +}; + +#define SORT_KEY struct key +#define SORT_PREFIX(x) s_##x +#define SORT_INPUT_FILE +#define SORT_OUTPUT_FILE + +static inline int +s_compare(struct key *a, struct key *b) +{ + return strcmp(a->line, b->line); +} + +static inline 
int +s_fetch_key(struct fastbuf *f, struct key *a) +{ + return !!bgets(f, a->line, sizeof(a->line)); +} + +static inline void +s_copy_data(struct fastbuf *src UNUSED, struct fastbuf *dest, struct key *k) +{ + bputsn(dest, k->line); +} + +#include "lib/sorter.h" + +int +main(int argc, char **argv) +{ + log_init(NULL); + cf_read(DEFAULT_CONFIG); + if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 || + optind != argc - 2) + die("Usage: sort-test "); + + s_sort(argv[optind], argv[optind+1]); + return 0; +} diff --git a/lib/sorter.c b/lib/sorter.c new file mode 100644 index 00000000..b43672f4 --- /dev/null +++ b/lib/sorter.c @@ -0,0 +1,49 @@ +/* + * Sherlock Library -- Universal Sorter + * + * (c) 2001 Martin Mares + */ + +#include "lib/lib.h" +#include "lib/conf.h" +#include "lib/fastbuf.h" + +#include +#include + +#define SORT_DECLARE_ONLY +#include "lib/sorter.h" + +uns sorter_trace; +uns sorter_presort_bufsize = 65536; +uns sorter_stream_bufsize = 65536; +static byte *sorter_template = "/tmp/sort%d.%d"; + +static struct cfitem sorter_config[] = { + { "Sorter", CT_SECTION, NULL }, + { "Trace", CT_INT, &sorter_trace }, + { "PresortBuffer", CT_INT, &sorter_presort_bufsize }, + { "StreamBuffer", CT_INT, &sorter_stream_bufsize }, + { "TempLate", CT_STRING, &sorter_template }, + { NULL, CT_STOP, NULL } +}; + +static void CONSTRUCTOR sorter_init_config(void) +{ + cf_register(sorter_config); +} + +uns sorter_pass_counter; +uns sorter_file_counter; + +struct fastbuf * +sorter_open_tmp(void) +{ + byte buf[256]; + struct fastbuf *f; + + sprintf(buf, sorter_template, (int) getpid(), sorter_file_counter++); + f = bopen(buf, O_RDWR | O_CREAT | O_EXCL, sorter_stream_bufsize); + f->is_temp_file = 1; + return f; +} diff --git a/lib/sorter.h b/lib/sorter.h new file mode 100644 index 00000000..0ed5e04d --- /dev/null +++ b/lib/sorter.h @@ -0,0 +1,245 @@ +/* + * Sherlock Library -- Universal Sorter + * + * (c) 2001 Martin Mares + */ + +/* + * This is not a normal 
header file, it's a generator of sorting + * routines. Each time you include it with parameters set in the + * corresponding preprocessor macros, it generates a file sorter + * with the parameters given. + * + * Recognized parameter macros: (those marked with [*] are mandatory) + * + * SORT_KEY [*] data type capable of storing a single key + * SORT_PREFIX(x) [*] add a name prefix (used on all global names + * defined by the sorter) + * SORT_PRESORT include an in-core presorting pass + * SORT_UNIFY merge items with identical keys + * SORT_DELETE_INPUT a C expression, if true, the input files are + * deleted as soon as possible + * SORT_INPUT_FILE input is a file with this name + * SORT_INPUT_FB input is a fastbuf stream + * SORT_INPUT_FBPAIR input is a pair of fastbuf streams + * (not supported by the presorter) + * SORT_OUTPUT_FILE output is a file with this name + * SORT_OUTPUT_FB output is a fastbuf stream + * + * You also need to define some (usually inline) functions which + * are called by the sorter to process your data: + * + * int PREFIX_compare(SORT_KEY *a, *b) + * compare two keys, result like strcmp + * int PREFIX_fetch_key(struct fastbuf *f, SORT_KEY *k) + * fetch next key, returns 1=ok, 0=eof + * void PREFIX_copy_data(struct fastbuf *src, *dest, SORT_KEY *k) + * write just fetched key k to dest and copy all data + * belonging to this key from src to dest. + * void PREFIX_merge_data(struct fastbuf *src1, *src2, *dest, SORT_KEY *k1, *k2) + * [used only in case SORT_UNIFY is defined] + * write just fetched key k to dest and merge data from + * two records with the same key (k1 and k2 are key occurrences + * in the corresponding streams). + * char * PREFIX_fetch_item(struct fastbuf *f, SORT_KEY *k, char *limit) + * [used only with SORT_PRESORT] + * fetch data belonging to a just fetched key and store + * them to memory following the key, but not over limit. + * Returns a pointer to first byte after the data + * or NULL if the data don't fit.
+ * Important: keys carrying no data must be position + * independent. + * void PREFIX_store_item(struct fastbuf *f, SORT_KEY *k) + * [used only with SORT_PRESORT] + * write key and all its data read with PREFIX_fetch_data + * to the stream given. + * SORT_KEY * PREFIX_merge_items(SORT_KEY *a, SORT_KEY *b) + * [used only with SORT_PRESORT && SORT_UNIFY] + * merge two items with the same key, returns pointer + * to at most one of the items, the rest will be removed + * from the list of items, but not deallocated, so + * the remaining item can freely reference data of the + * other one. + */ + +/* Declarations of externals from sorter.c */ + +#ifndef SORT_DECLS_READ +#define SORT_DECLS_READ + +extern uns sorter_trace; +extern uns sorter_presort_bufsize; +extern uns sorter_stream_bufsize; + +extern uns sorter_pass_counter, sorter_file_counter; +struct fastbuf *sorter_open_tmp(void); + +#endif /* !SORT_DECLS_READ */ + +/* The sorter proper */ + +#ifndef SORT_DECLARE_ONLY + +#include "lib/fastbuf.h" +#include +#include + +#if !defined(SORT_KEY) || !defined(SORT_PREFIX) +#error Some of the mandatory configuration macros are missing. +#endif + +#define P(x) SORT_PREFIX(x) +#define SWAP(x,y,z) do { z=x; x=y; y=z; } while(0) + +#if defined(SORT_UNIFY) || defined(SORT_UNIQUE) +#define LESS < +#else +#define LESS <= +#endif + +static void +P(pass)(struct fastbuf **fb1, struct fastbuf **fb2) +{ + struct fastbuf *in1 = *fb1; + struct fastbuf *in2 = *fb2; + struct fastbuf *out1 = NULL; + struct fastbuf *out2 = NULL; + SORT_KEY kbuf1, kbuf2, kbuf3, kbuf4; + SORT_KEY *kin1 = &kbuf1; + SORT_KEY *kprev1 = &kbuf2; + SORT_KEY *kin2 = &kbuf3; + SORT_KEY *kprev2 = &kbuf4; + SORT_KEY *kout = NULL; + SORT_KEY *ktmp; + int next1, next2, comp; + int run1, run2; + uns run_count = 0; + + run1 = next1 = in1 ? P(fetch_key)(in1, kin1) : 0; + run2 = next2 = in2 ? 
P(fetch_key)(in2, kin2) : 0; + while (next1 || next2) + { + if (!run1) + comp = 1; + else if (!run2) + comp = -1; + else + comp = P(compare)(kin1, kin2); + ktmp = (comp <= 0) ? kin1 : kin2; + if (!kout || !(P(compare)(kout, ktmp) LESS 0)) + { + struct fastbuf *t; + SWAP(out1, out2, t); + if (!out1) + out1 = sorter_open_tmp(); + run_count++; + } + if (comp LESS 0) + { + P(copy_data)(in1, out1, kin1); + SWAP(kin1, kprev1, ktmp); + next1 = P(fetch_key)(in1, kin1); + run1 = next1 && (P(compare)(kprev1, kin1) LESS 0); + kout = kprev1; + } +#ifdef SORT_UNIFY + else if (comp == 0) + { + P(merge_data)(in1, in2, out1, kin1, kin2); + SWAP(kin1, kprev1, ktmp); + next1 = P(fetch_key)(in1, kin1); /* FIXME: Re-use other code? */ + run1 = next1 && (P(compare)(kprev1, kin1) LESS 0); + SWAP(kin2, kprev2, ktmp); + next2 = P(fetch_key)(in2, kin2); + run2 = next2 && (P(compare)(kprev2, kin2) LESS 0); + kout = kprev2; + } +#endif + else + { + P(copy_data)(in2, out1, kin2); + SWAP(kin2, kprev2, ktmp); + next2 = P(fetch_key)(in2, kin2); + run2 = next2 && (P(compare)(kprev2, kin2) LESS 0); + kout = kprev2; + } + if (!run1 && !run2) + { + run1 = next1; + run2 = next2; + } + } + bclose(in1); + bclose(in2); + if (sorter_trace) + log(L_INFO, "Pass %d: %d runs, %d+%d KB", sorter_pass_counter, run_count, + (out1 ? (int)((btell(out1) + 1023) / 1024) : 0), + (out2 ? (int)((btell(out2) + 1023) / 1024) : 0)); + if (out1) /* FIXME: What about empty output? */ + { + bflush(out1); + bsetpos(out1, 0); + } + if (out2) + { + bflush(out2); + bsetpos(out2, 0); + } + *fb1 = out1; + *fb2 = out2; + sorter_pass_counter++; +} + +static +#ifdef SORT_OUTPUT_FB +struct fastbuf * +#elif defined(SORT_OUTPUT_FILE) +void +#else +#error No output defined. +#endif +P(sort)( +#ifdef SORT_INPUT_FILE +byte *inname +#elif defined(SORT_INPUT_FB) +struct fastbuf *fb1 +#elif defined(SORT_INPUT_FBPAIR) +struct fastbuf *fb1, struct fastbuf *fb2 +#else +#error No input defined. 
+#endif +#ifdef SORT_OUTPUT_FILE +,byte *outname +#endif +) +{ +#ifdef SORT_INPUT_FILE + struct fastbuf *fb1, *fb2; + fb1 = bopen(inname, O_RDONLY, sorter_stream_bufsize); +#ifdef SORT_DELETE_INPUT + fb1->is_temp_file = SORT_DELETE_INPUT; +#endif + fb2 = NULL; +#elif defined(SORT_INPUT_FB) + struct fastbuf *fb2 = NULL; +#endif + + sorter_pass_counter = 1; + do P(pass)(&fb1, &fb2); while (fb1 && fb2); + if (!fb1) + fb1 = fb2; + fb1->is_temp_file = 0; + +#ifdef SORT_OUTPUT_FB + return fb1; +#else + if (rename(fb1->name, outname) < 0) + die("rename(%s,%s): %m", fb1->name, outname); +#endif +} + +#undef P +#undef LESS +#undef SWAP + +#endif /* !SORT_DECLARE_ONLY */