mj.ucw.cz Git - libucw.git/commitdiff
First version of the sorter. No presorting phase yet.
author Martin Mares <mj@ucw.cz>
Sun, 4 Feb 2001 14:44:54 +0000 (14:44 +0000)
committer Martin Mares <mj@ucw.cz>
Sun, 4 Feb 2001 14:44:54 +0000 (14:44 +0000)
lib/Makefile
lib/sort-test.c [new file with mode: 0644]
lib/sorter.c [new file with mode: 0644]
lib/sorter.h [new file with mode: 0644]

index 918f34bb329e3e965399d9d93dc94c8cecaa58e0..5be7ccef985f0a3ca5f7113772b0525546190390 100644 (file)
@@ -1,12 +1,12 @@
-# Makefile for the Sherlock Library (c) 1997--2000 Martin Mares <mj@ucw.cz>
+# Makefile for the Sherlock Library (c) 1997--2001 Martin Mares <mj@ucw.cz>
 
 DIRS+=lib
-PROGS+=obj/lib/db-test obj/lib/db-rebuild obj/lib/buckettool obj/lib/conf-test
+PROGS+=obj/lib/db-test obj/lib/db-rebuild obj/lib/buckettool
 
 SHLIB_OBJS=alloc.o alloc_str.o ctmatch.o db.o fastbuf.o fb-file.o fb-mem.o lists.o \
        log.o log2.o md5.o md5hex.o mmap.o pagecache.o patimatch.o patmatch.o pool.o \
        prime.o random.o realloc.o regex.o temp.o timer.o url.o wildmatch.o \
-       wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o
+       wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o
 
 obj/lib/libsh.a: $(addprefix obj/lib/,$(SHLIB_OBJS))
 
@@ -14,3 +14,4 @@ obj/lib/db-test: obj/lib/db-test.o obj/lib/libsh.a
 obj/lib/db-rebuild: obj/lib/db-rebuild.o obj/lib/libsh.a
 obj/lib/buckettool: obj/lib/buckettool.o obj/lib/libsh.a
 obj/lib/conf-test: obj/lib/conf-test.o obj/lib/libsh.a
+obj/lib/sort-test: obj/lib/sort-test.o obj/lib/libsh.a
diff --git a/lib/sort-test.c b/lib/sort-test.c
new file mode 100644 (file)
index 0000000..2f61b15
--- /dev/null
@@ -0,0 +1,50 @@
+/* Test for sorting routines */
+
+#include "lib/lib.h"
+#include "lib/conf.h"
+#include "lib/fastbuf.h"
+
+#include <stdio.h>
+#include <string.h>
+
+struct key {                           /* Sort key of the test program: one line of the input file */
+  char line[1024];                     /* Filled by bgets() in s_fetch_key(), compared with strcmp() in s_compare() */
+};
+
+#define SORT_KEY struct key            /* Key type handed to the sorter generator */
+#define SORT_PREFIX(x) s_##x           /* Generated names get the s_ prefix (s_pass, s_sort) */
+#define SORT_INPUT_FILE                        /* Input is passed to s_sort() as a file name */
+#define SORT_OUTPUT_FILE               /* Output is passed to s_sort() as a file name */
+
+static inline int                      /* Sorter callback: compare two keys, result like strcmp() */
+s_compare(struct key *a, struct key *b)
+{
+  return strcmp(a->line, b->line);     /* Keys are whole lines, ordered as C strings */
+}
+
+static inline int                      /* Sorter callback: fetch the next line as a key; returns 1=ok, 0=eof */
+s_fetch_key(struct fastbuf *f, struct key *a)
+{
+  return bgets(f, a->line, sizeof(a->line)) ? 1 : 0;   /* Fold the bgets() result to a 0/1 flag */
+}
+
+static inline void                     /* Sorter callback: write key k to dest; keys carry no extra data here, so src is unused */
+s_copy_data(struct fastbuf *src UNUSED, struct fastbuf *dest, struct key *k)
+{
+  bputsn(dest, k->line);               /* NOTE(review): presumably writes the line followed by a newline -- confirm in fastbuf */
+}
+
+#include "lib/sorter.h"
+
+int                                    /* sort-test <input> <output>: sort the lines of a text file */
+main(int argc, char **argv)
+{
+  int opt;                             /* Result of the option parser */
+  log_init(NULL);
+  cf_read(DEFAULT_CONFIG);
+  opt = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL);
+  if (opt >= 0 || optind != argc - 2)  /* No test-specific options; exactly two file names must remain */
+    die("Usage: sort-test <input> <output>");
+  s_sort(argv[optind], argv[optind+1]);
+  return 0;
+}
diff --git a/lib/sorter.c b/lib/sorter.c
new file mode 100644 (file)
index 0000000..b43672f
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ *     Sherlock Library -- Universal Sorter
+ *
+ *     (c) 2001 Martin Mares <mj@ucw.cz>
+ */
+
+#include "lib/lib.h"
+#include "lib/conf.h"
+#include "lib/fastbuf.h"
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/fcntl.h>
+#define SORT_DECLARE_ONLY
+#include "lib/sorter.h"
+
+uns sorter_trace;                      /* Config "Trace": non-zero makes each pass log its statistics */
+uns sorter_presort_bufsize = 65536;    /* Config "PresortBuffer"; not referenced by any code in this change */
+uns sorter_stream_bufsize = 65536;     /* Config "StreamBuffer": buffer size passed to bopen() */
+static byte *sorter_template = "/tmp/sort%d.%d";       /* printf-style temp file name: first %d is the PID, second the file counter */
+
+static struct cfitem sorter_config[] = {       /* Items of the "Sorter" configuration section */
+  { "Sorter",          CT_SECTION,     NULL },
+  { "Trace",           CT_INT,         &sorter_trace },
+  { "PresortBuffer",   CT_INT,         &sorter_presort_bufsize },
+  { "StreamBuffer",    CT_INT,         &sorter_stream_bufsize },
+  { "TempLate",                CT_STRING,      &sorter_template },     /* NOTE(review): the capital "L" looks like a typo for "Template" -- confirm before renaming, it changes the config key */
+  { NULL,              CT_STOP,        NULL }
+};
+
+static void CONSTRUCTOR sorter_init_config(void)       /* Registers the sorter's config items; CONSTRUCTOR presumably runs this before main() -- confirm in lib.h */
+{
+  cf_register(sorter_config);
+}
+
+uns sorter_pass_counter;               /* Number of the pass in progress; set to 1 by the generated sort routine, incremented after each pass */
+uns sorter_file_counter;               /* Running number put into temporary file names by sorter_open_tmp() */
+
+struct fastbuf *                       /* Create a new temporary file and return it opened for reading and writing */
+sorter_open_tmp(void)
+{
+  byte buf[256];                       /* File name: sorter_template filled with our PID and a running counter */
+  struct fastbuf *f;
+  if ((unsigned) snprintf(buf, sizeof(buf), sorter_template, (int) getpid(), sorter_file_counter++) >= sizeof(buf))
+    die("Sorter: temporary file name too long");       /* Template is configurable: refuse a truncated name; the unsigned cast also catches a negative return */
+  f = bopen(buf, O_RDWR | O_CREAT | O_EXCL, sorter_stream_bufsize);    /* O_EXCL: never reuse an already existing file */
+  f->is_temp_file = 1;
+  return f;
+}
diff --git a/lib/sorter.h b/lib/sorter.h
new file mode 100644 (file)
index 0000000..0ed5e04
--- /dev/null
@@ -0,0 +1,245 @@
+/*
+ *     Sherlock Library -- Universal Sorter
+ *
+ *     (c) 2001 Martin Mares <mj@ucw.cz>
+ */
+
+/*
+ *  This is not a normal header file, it's a generator of sorting
+ *  routines.  Each time you include it with parameters set in the
+ *  corresponding preprocessor macros, it generates a file sorter
+ *  with the parameters given.
+ *
+ *  Recognized parameter macros: (those marked with [*] are mandatory)
+ *
+ *  SORT_KEY       [*] data type capable of storing a single key
+ *  SORT_PREFIX(x)  [*] add a name prefix (used on all global names
+ *                     defined by the sorter)
+ *  SORT_PRESORT       include an in-core presorting pass
+ *  SORT_UNIFY         merge items with identical keys
+ *  SORT_DELETE_INPUT  a C expression, if true, the input files are
+ *                     deleted as soon as possible
+ *  SORT_INPUT_FILE    input is a file with this name
+ *  SORT_INPUT_FB      input is a fastbuf stream
+ *  SORT_INPUT_FBPAIR  input is a pair of fastbuf streams
+ *                     (not supported by the presorter)
+ *  SORT_OUTPUT_FILE   output is a file with this name
+ *  SORT_OUTPUT_FB     output is a fastbuf stream
+ *
+ *  You also need to define some (usually inline) functions which
+ *  are called by the sorter to process your data:
+ *
+ *  int PREFIX_compare(SORT_KEY *a, *b)
+ *                     compare two keys, result like strcmp
+ *  int PREFIX_fetch_key(struct fastbuf *f, SORT_KEY *k)
+ *                     fetch next key, returns 1=ok, 0=eof
+ *  void PREFIX_copy_data(struct fastbuf *src, *dest, SORT_KEY *k)
+ *                     write just fetched key k to dest and copy all data
+ *                     belonging to this key from src to dest.
+ *  void PREFIX_merge_data(struct fastbuf *src1, *src2, *dest, SORT_KEY *k1, *k2)
+ *                     [used only in case SORT_UNIFY is defined]
+ *                     write the just fetched key to dest and merge data from
+ *                     two records with the same key (k1 and k2 are the key occurrences
+ *                     in the corresponding streams).
+ *  char * PREFIX_fetch_item(struct fastbuf *f, SORT_KEY *k, char *limit)
+ *                     [used only with SORT_PRESORT]
+ *                     fetch data belonging to a just fetched key and store
+ *                     them to memory following the key, but not over limit.
+ *                     Returns a pointer to first byte after the data
+ *                     or NULL if the data don't fit.
+ *                     Important: keys carrying no data must be position
+ *                     independent.
+ *  void PREFIX_store_item(struct fastbuf *f, SORT_KEY *k)
+ *                     [used only with SORT_PRESORT]
+ *                     write key and all its data read with PREFIX_fetch_item
+ *                     to the stream given.
+ *  SORT_KEY * PREFIX_merge_items(SORT_KEY *a, SORT_KEY *b)
+ *                     [used only with SORT_PRESORT && SORT_UNIFY]
+ *                     merge two items with the same key, returns pointer
+ *                     to at most one of the items, the rest will be removed
+ *                     from the list of items, but not deallocated, so
+ *                     the remaining item can freely reference data of the
+ *                     other one.
+ */
+
+/* Declarations of externals from sorter.c */
+
+#ifndef SORT_DECLS_READ                        /* This file may be included several times; declare the externals only once */
+#define SORT_DECLS_READ
+
+extern uns sorter_trace;               /* Non-zero: each pass logs its statistics */
+extern uns sorter_presort_bufsize;
+extern uns sorter_stream_bufsize;      /* Buffer size used when the sorter opens its input file by name */
+
+extern uns sorter_pass_counter, sorter_file_counter;
+struct fastbuf *sorter_open_tmp(void); /* Opens a fresh temporary file; called by the generated pass routine for its outputs */
+
+#endif         /* !SORT_DECLS_READ */
+
+/* The sorter proper */
+
+#ifndef SORT_DECLARE_ONLY
+
+#include "lib/fastbuf.h"
+#include <unistd.h>
+#include <fcntl.h>
+
+#if !defined(SORT_KEY) || !defined(SORT_PREFIX)
+#error Some of the mandatory configuration macros are missing.
+#endif
+
+#define P(x) SORT_PREFIX(x)            /* Shorthand for prefixing the generated names */
+#define SWAP(x,y,z) do { z=x; x=y; y=z; } while(0)     /* Exchange x and y, using z as scratch */
+
+#if defined(SORT_UNIFY) || defined(SORT_UNIQUE)        /* NOTE(review): SORT_UNIQUE is tested here but not described in the parameter list of this file */
+#define LESS <                         /* Strict order: an equal key ends the run */
+#else
+#define LESS <=                                /* Equal keys may stay within one run */
+#endif
+
+static void                            /* One merge pass: merge ascending runs from *fb1 and *fb2 into up to two new temporary files */
+P(pass)(struct fastbuf **fb1, struct fastbuf **fb2)    /* In: the input streams (closed here); out: the rewound output streams, NULL where none was created */
+{
+  struct fastbuf *in1 = *fb1;
+  struct fastbuf *in2 = *fb2;
+  struct fastbuf *out1 = NULL;         /* Output the current run is written to */
+  struct fastbuf *out2 = NULL;         /* The other output; the two are exchanged at every new run */
+  SORT_KEY kbuf1, kbuf2, kbuf3, kbuf4;
+  SORT_KEY *kin1 = &kbuf1;             /* Key fetched from in1 and not yet written */
+  SORT_KEY *kprev1 = &kbuf2;           /* Key most recently taken from in1 */
+  SORT_KEY *kin2 = &kbuf3;             /* The same pair for in2 */
+  SORT_KEY *kprev2 = &kbuf4;
+  SORT_KEY *kout = NULL;               /* Last key written to the output, NULL before the first one */
+  SORT_KEY *ktmp;
+  int next1, next2, comp;              /* nextN: a fetched key of inN is pending */
+  int run1, run2;                      /* runN: the pending key of inN belongs to the input run being merged */
+  uns run_count = 0;                   /* Number of output runs started */
+
+  run1 = next1 = in1 ? P(fetch_key)(in1, kin1) : 0;
+  run2 = next2 = in2 ? P(fetch_key)(in2, kin2) : 0;
+  while (next1 || next2)
+    {
+      if (!run1)                       /* Run of in1 is exhausted: take from in2 */
+       comp = 1;
+      else if (!run2)                  /* Run of in2 is exhausted: take from in1 */
+       comp = -1;
+      else
+       comp = P(compare)(kin1, kin2);
+      ktmp = (comp <= 0) ? kin1 : kin2;        /* Key to be written next (on a tie kin1 stands for both) */
+      if (!kout || !(P(compare)(kout, ktmp) LESS 0))   /* It cannot extend the current output run: start a new run */
+       {
+         struct fastbuf *t;
+         SWAP(out1, out2, t);          /* Alternate between the two outputs */
+         if (!out1)
+           out1 = sorter_open_tmp();
+         run_count++;
+       }
+      if (comp LESS 0)                 /* Take the item from in1 */
+       {
+         P(copy_data)(in1, out1, kin1);
+         SWAP(kin1, kprev1, ktmp);     /* Written key becomes kprev1, the other buffer receives the next fetch */
+         next1 = P(fetch_key)(in1, kin1);
+         run1 = next1 && (P(compare)(kprev1, kin1) LESS 0);    /* Does the new key continue the input run? */
+         kout = kprev1;
+       }
+#ifdef SORT_UNIFY
+      else if (comp == 0)              /* Same key in both inputs: merge the two items into one */
+       {
+         P(merge_data)(in1, in2, out1, kin1, kin2);
+         SWAP(kin1, kprev1, ktmp);
+         next1 = P(fetch_key)(in1, kin1); /* FIXME: Re-use other code? */
+         run1 = next1 && (P(compare)(kprev1, kin1) LESS 0);
+         SWAP(kin2, kprev2, ktmp);
+         next2 = P(fetch_key)(in2, kin2);
+         run2 = next2 && (P(compare)(kprev2, kin2) LESS 0);
+         kout = kprev2;
+       }
+#endif
+      else                             /* Take the item from in2 */
+       {
+         P(copy_data)(in2, out1, kin2);
+         SWAP(kin2, kprev2, ktmp);
+         next2 = P(fetch_key)(in2, kin2);
+         run2 = next2 && (P(compare)(kprev2, kin2) LESS 0);
+         kout = kprev2;
+       }
+      if (!run1 && !run2)              /* Both input runs are finished: go on with the next pair of runs */
+       {
+         run1 = next1;
+         run2 = next2;
+       }
+    }
+  bclose(in1);
+  bclose(in2);                         /* NOTE(review): in2 is NULL in the first pass of a single-input sort; this relies on bclose() accepting NULL -- confirm */
+  if (sorter_trace)
+    log(L_INFO, "Pass %d: %d runs, %d+%d KB", sorter_pass_counter, run_count,
+       (out1 ? (int)((btell(out1) + 1023) / 1024) : 0),
+       (out2 ? (int)((btell(out2) + 1023) / 1024) : 0));
+  if (out1)                            /* FIXME: What about empty output? */
+    {
+      bflush(out1);
+      bsetpos(out1, 0);                /* Rewind, the output is the input of whoever reads it next */
+    }
+  if (out2)
+    {
+      bflush(out2);
+      bsetpos(out2, 0);
+    }
+  *fb1 = out1;
+  *fb2 = out2;
+  sorter_pass_counter++;
+}
+
+static                                 /* Generated entry point: sort the input and return the result stream or store it under outname */
+#ifdef SORT_OUTPUT_FB
+struct fastbuf *
+#elif defined(SORT_OUTPUT_FILE)
+void
+#else
+#error No output defined.
+#endif
+P(sort)(
+#ifdef SORT_INPUT_FILE
+byte *inname
+#elif defined(SORT_INPUT_FB)
+struct fastbuf *fb1
+#elif defined(SORT_INPUT_FBPAIR)
+struct fastbuf *fb1, struct fastbuf *fb2
+#else
+#error No input defined.
+#endif
+#ifdef SORT_OUTPUT_FILE
+,byte *outname
+#endif
+)
+{
+#ifdef SORT_INPUT_FILE
+  struct fastbuf *fb1, *fb2;
+  fb1 = bopen(inname, O_RDONLY, sorter_stream_bufsize);
+#ifdef SORT_DELETE_INPUT
+  fb1->is_temp_file = SORT_DELETE_INPUT;
+#endif
+  fb2 = NULL;
+#elif defined(SORT_INPUT_FB)
+  struct fastbuf *fb2 = NULL;
+#endif
+  sorter_pass_counter = 1;
+  do P(pass)(&fb1, &fb2); while (fb1 && fb2);  /* Repeat until a pass leaves at most one output file */
+  if (!fb1)                            /* Empty input: no pass created any output, so produce an empty file instead of dereferencing NULL */
+    fb1 = fb2 ? fb2 : sorter_open_tmp();
+  fb1->is_temp_file = 0;               /* The result is no longer a temporary file */
+
+#ifdef SORT_OUTPUT_FB
+  return fb1;
+#else
+  if (rename(fb1->name, outname) < 0)
+    die("rename(%s,%s): %m", fb1->name, outname);
+  bclose(fb1);                         /* The data now live under outname; release the stream instead of leaking it */
+#endif
+}
+
+#undef P
+#undef LESS
+#undef SWAP
+
+#endif         /* !SORT_DECLARE_ONLY */