X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Fsorter.h;h=ed82eb34893fad81c80350a06975d264267b56da;hb=b7b9a3d3d56430caba2f41a2e9cae8ca87dce1d6;hp=be2e80a33b12dade3daffafc9683626998d7b5c0;hpb=78d7afe64ca18c0cf679aec89d611641f5cbe872;p=libucw.git diff --git a/lib/sorter.h b/lib/sorter.h index be2e80a3..ed82eb34 100644 --- a/lib/sorter.h +++ b/lib/sorter.h @@ -1,7 +1,11 @@ /* - * Sherlock Library -- Universal Sorter + * UCW Library -- Universal Sorter * - * (c) 2001 Martin Mares + * (c) 2001--2004 Martin Mares + * (c) 2004 Robert Spalek + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. */ /* @@ -15,12 +19,26 @@ * SORT_KEY [*] data type capable of storing a single key * SORT_PREFIX(x) [*] add a name prefix (used on all global names * defined by the sorter) - * SORT_PRESORT include an in-core presorting pass + * SORT_PRESORT include an in-core pre-sorting pass. Beware, when in + * the pre-sorting mode, it's quite possible that the + * comparison function will be called with both arguments + * identical. + * SORT_UP_TO a C expression, if defined, sorting is stopped after the + * average run length in the file exceeds the value of this + * expression (in bytes) * SORT_UNIFY merge items with identical keys + * SORT_UNIQUE all items have distinct keys (checked in debug mode) + * SORT_REGULAR all items are equally long and they don't contain + * anything else than the key. In this case, the sorter + * automatically supplies fetch_key, copy_data, fetch_item + * and store_item functions. Item size is also expected + * to be small. * SORT_DELETE_INPUT a C expression, if true, the input files are * deleted as soon as possible * SORT_INPUT_FILE input is a file with this name * SORT_INPUT_FB input is a fastbuf stream + * (can be safely NULL if you want to treat original + * input in a different way by file read functions) * SORT_INPUT_FBPAIR input is a pair of fastbuf streams * (not supported by the presorter) * SORT_OUTPUT_FILE output is a file with this name @@ -32,7 +50,7 @@ * int PREFIX_compare(SORT_KEY *a, *b) * compare two keys, result like strcmp * int PREFIX_fetch_key(struct fastbuf *f, SORT_KEY *k) - * fetch next key, returns 1=ok, 0=eof + * fetch next key, returns nonzero=ok, 0=eof * void PREFIX_copy_data(struct fastbuf *src, *dest, SORT_KEY *k) * write just fetched key k to dest and copy all data * belonging to this key from src to dest. @@ -46,9 +64,11 @@ * fetch data belonging to a just fetched key and store * them to memory following the key, but not over limit. * Returns a pointer to first byte after the data - * or NULL if the data don't fit. - * Important: keys carrying no data must be position - * independent. + * or NULL if the data don't fit. For variable-length + * keys, it can use the rest of SORT_KEY and even return + * pointer before end of the key. + * Important: before PREFIX_fetch_item() succeeds, the key + * must be position independent, the sorter can copy it. * void PREFIX_store_item(struct fastbuf *f, SORT_KEY *k) * [used only with SORT_PRESORT] * write key and all its data read with PREFIX_fetch_data @@ -60,26 +80,12 @@ * from the list of items, but not deallocated, so * the remaining item can freely reference data of the * other one. + * + * After including this file, all parameter macros are automatically + * undef'd. */ -/* Declarations of externals from sorter.c */ - -#ifndef SORT_DECLS_READ -#define SORT_DECLS_READ - -extern uns sorter_trace; -extern uns sorter_presort_bufsize; -extern uns sorter_stream_bufsize; - -extern uns sorter_pass_counter, sorter_file_counter; -struct fastbuf *sorter_open_tmp(void); - -#endif /* !SORT_DECLS_READ */ - -/* The sorter proper */ - -#ifndef SORT_DECLARE_ONLY - +#include "lib/sorter-globals.h" #include "lib/fastbuf.h" #include #include @@ -98,19 +104,52 @@ struct fastbuf *sorter_open_tmp(void); #define LESS <= #endif +#if defined(SORT_UNIQUE) && defined(DEBUG_ASSERTS) +#define SORT_ASSERT_UNIQUE +#endif + +#ifdef SORT_REGULAR + +static inline int +P(fetch_key)(struct fastbuf *in, SORT_KEY *x) +{ + return breadb(in, x, sizeof(*x)); +} + +static inline void +P(copy_data)(struct fastbuf *in UNUSED, struct fastbuf *out, SORT_KEY *x) +{ + bwrite(out, x, sizeof(*x)); +} + +static inline byte * +P(fetch_item)(struct fastbuf *in UNUSED, SORT_KEY *x UNUSED, byte *limit UNUSED) +{ + return (byte *)(x+1); +} + +static inline void +P(store_item)(struct fastbuf *out, SORT_KEY *x) +{ + bwrite(out, x, sizeof(*x)); +} + +#endif + static struct fastbuf * P(flush_out)(struct fastbuf *out) { if (out) - { - bflush(out); - bsetpos(out, 0); - } + brewind(out); return out; } -static void -P(pass)(struct fastbuf **fb1, struct fastbuf **fb2) +static uns +P(pass)(struct fastbuf **fb1, struct fastbuf **fb2 +#ifdef SORT_UP_TO + , uns stop_sorting +#endif +) { struct fastbuf *in1 = *fb1; struct fastbuf *in2 = *fb2; @@ -141,9 +180,12 @@ P(pass)(struct fastbuf **fb1, struct fastbuf **fb2) if (!kout || !(P(compare)(kout, ktmp) LESS 0)) { struct fastbuf *t; - SWAP(out1, out2, t); +#ifdef SORT_UP_TO + if (!stop_sorting) +#endif + SWAP(out1, out2, t); if (!out1) - out1 = sorter_open_tmp(); + out1 = bopen_tmp(sorter_stream_bufsize); run_count++; } if (comp LESS 0) @@ -159,13 +201,17 @@ P(pass)(struct fastbuf **fb1, struct fastbuf **fb2) { P(merge_data)(in1, in2, out1, kin1, kin2); SWAP(kin1, kprev1, ktmp); - next1 = P(fetch_key)(in1, kin1); /* FIXME: Re-use other code? */ + next1 = P(fetch_key)(in1, kin1); run1 = next1 && (P(compare)(kprev1, kin1) LESS 0); SWAP(kin2, kprev2, ktmp); next2 = P(fetch_key)(in2, kin2); run2 = next2 && (P(compare)(kprev2, kin2) LESS 0); kout = kprev2; } +#endif +#ifdef SORT_ASSERT_UNIQUE + else if (unlikely(comp == 0)) + ASSERT(0); #endif else { @@ -190,10 +236,81 @@ P(pass)(struct fastbuf **fb1, struct fastbuf **fb2) *fb1 = P(flush_out)(out1); *fb2 = P(flush_out)(out2); sorter_pass_counter++; + return run_count; } #ifdef SORT_PRESORT +#if defined(SORT_REGULAR) && !defined(SORT_UNIFY) + +/* If we are doing a simple sort on a regular file, we can use a faster presorting strategy */ + +static SORT_KEY *P(array); + +#define ASORT_PREFIX(x) SORT_PREFIX(x##_array) +#define ASORT_KEY_TYPE SORT_KEY +#define ASORT_ELT(i) P(array)[i] +#define ASORT_LT(x,y) (P(compare)(&(x),&(y)) < 0) + +#include "lib/arraysort.h" + +static void +P(presort)(struct fastbuf **fb1, struct fastbuf **fb2) +{ + struct fastbuf *in = *fb1; + struct fastbuf *out1 = NULL; + struct fastbuf *out2 = NULL; + struct fastbuf *tbuf; + uns buf_items = sorter_presort_bufsize / sizeof(SORT_KEY); + uns run_count = 0; + SORT_KEY last_out = { }, *array; + + ASSERT(!*fb2); + if (buf_items < 2) + die("PresortBuffer set too low"); + P(array) = array = xmalloc(buf_items * sizeof(SORT_KEY)); + + for(;;) + { + uns s = bread(in, array, buf_items * sizeof(SORT_KEY)); + uns n = s / sizeof(SORT_KEY); + ASSERT(!(s % sizeof(SORT_KEY))); + if (!n) + break; + P(sort_array)(n); +#ifdef SORT_ASSERT_UNIQUE + for (uns i=0; i= 0)) + ASSERT(0); + ASSERT(!run_count || P(compare)(&last_out, &array[0])); +#endif + if (!run_count || P(compare)(&last_out, &array[0]) > 0) + { + run_count++; +#ifdef SORT_UP_TO + if (sorter_presort_bufsize < (uns) SORT_UP_TO) +#endif + SWAP(out1, out2, tbuf); + if (!out1) + out1 = bopen_tmp(sorter_stream_bufsize); + } + last_out = array[n-1]; + bwrite(out1, array, n * sizeof(SORT_KEY)); + } + + bclose(in); + if (sorter_trace) + log(L_INFO, "Pass 0: %d runs, %d+%d KB", + run_count, + (out1 ? (int)((btell(out1) + 1023) / 1024) : 0), + (out2 ? (int)((btell(out2) + 1023) / 1024) : 0)); + *fb1 = P(flush_out)(out1); + *fb2 = P(flush_out)(out2); + xfree(array); +} + +#else + #define SORT_NODE struct P(presort_node) SORT_NODE { @@ -267,9 +384,12 @@ P(presort)(struct fastbuf **fb1, struct fastbuf **fb2) leftover = NULL; while (cont) { - SWAP(out1, out2, tbuf); +#ifdef SORT_UP_TO + if (sorter_presort_bufsize < SORT_UP_TO) +#endif + SWAP(out1, out2, tbuf); if (!out1) - out1 = sorter_open_tmp(); + out1 = bopen_tmp(sorter_stream_bufsize); current = buffer; last = &first; if (leftover) @@ -281,7 +401,7 @@ P(presort)(struct fastbuf **fb1, struct fastbuf **fb2) } for(;;) { - current = (byte *) ALIGN((addr_int_t) current, CPU_STRUCT_ALIGN); + current = (byte *) ALIGN_TO((addr_int_t) current, CPU_STRUCT_ALIGN); if (current + sizeof(*this) > bufend) break; this = (SORT_NODE *) current; @@ -328,6 +448,9 @@ P(presort)(struct fastbuf **fb1, struct fastbuf **fb2) first = second->next; continue; } +#endif +#ifdef SORT_ASSERT_UNIQUE + ASSERT(!first->next || P(compare)(&first->key, &first->next->key)); #endif P(store_item)(out1, &first->key); first = first->next; @@ -342,8 +465,11 @@ P(presort)(struct fastbuf **fb1, struct fastbuf **fb2) (out2 ? (int)((btell(out2) + 1023) / 1024) : 0)); *fb1 = P(flush_out)(out1); *fb2 = P(flush_out)(out2); + xfree(buffer); } +#endif /* SORT_REGULAR && !SORT_UNIFY */ + #endif /* SORT_PRESORT */ static @@ -372,27 +498,47 @@ struct fastbuf *fb1, struct fastbuf *fb2 #ifdef SORT_INPUT_FILE struct fastbuf *fb1, *fb2; fb1 = bopen(inname, O_RDONLY, sorter_stream_bufsize); -#ifdef SORT_DELETE_INPUT - fb1->is_temp_file = SORT_DELETE_INPUT; -#endif fb2 = NULL; #elif defined(SORT_INPUT_FB) struct fastbuf *fb2 = NULL; #endif +#ifdef SORT_DELETE_INPUT + bconfig(fb1, BCONFIG_IS_TEMP_FILE, SORT_DELETE_INPUT); +#endif sorter_pass_counter = 1; #ifdef SORT_PRESORT P(presort)(&fb1, &fb2); if (fb2) #endif +#ifndef SORT_UP_TO do P(pass)(&fb1, &fb2); while (fb1 && fb2); +#else + { + sh_off_t run_count, max_run_count = 0; + if (fb1) + max_run_count += bfilesize(fb1); + if (fb2) + max_run_count += bfilesize(fb2); +#ifdef SORT_PRESORT + run_count = max_run_count / sorter_presort_bufsize; +#else + run_count = max_run_count; +#endif + if (SORT_UP_TO) + max_run_count /= SORT_UP_TO; + do + run_count = P(pass)(&fb1, &fb2, (run_count+1)/2 <= max_run_count); + while (fb1 && fb2); + } +#endif if (!fb1) - fb1 = sorter_open_tmp(); + fb1 = bopen_tmp(sorter_stream_bufsize); #ifdef SORT_OUTPUT_FB return fb1; #else - fb1->is_temp_file = 0; + bconfig(fb1, BCONFIG_IS_TEMP_FILE, 0); if (rename(fb1->name, outname) < 0) die("rename(%s,%s): %m", fb1->name, outname); bclose(fb1); @@ -403,5 +549,17 @@ struct fastbuf *fb1, struct fastbuf *fb2 #undef LESS #undef SWAP #undef SORT_NODE - -#endif /* !SORT_DECLARE_ONLY */ +#undef SORT_KEY +#undef SORT_PREFIX +#undef SORT_UNIFY +#undef SORT_UNIQUE +#undef SORT_ASSERT_UNIQUE +#undef SORT_REGULAR +#undef SORT_DELETE_INPUT +#undef SORT_INPUT_FILE +#undef SORT_INPUT_FB +#undef SORT_INPUT_FBPAIR +#undef SORT_OUTPUT_FILE +#undef SORT_OUTPUT_FB +#undef SORT_PRESORT +#undef SORT_UP_TO