X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Fsorter%2Fs-multiway.h;h=83e928f2621c6a8ba07adeb244b0b3ad63a5a5e3;hb=66dfcb94d094eb496bc675234cd15967e1ebd892;hp=a4352fe96c819401ff0bd7ce1ce150b7ee92d569;hpb=5a78c3505ae7fa76a061e26676450049ec5946d5;p=libucw.git diff --git a/lib/sorter/s-multiway.h b/lib/sorter/s-multiway.h index a4352fe9..83e928f2 100644 --- a/lib/sorter/s-multiway.h +++ b/lib/sorter/s-multiway.h @@ -7,72 +7,141 @@ * of the GNU Lesser General Public License. */ -static inline void P(update_tree)(P(key) *keys, int *tree, uns i) +/* + * We use a binary tree to keep track of the current minimum. The tree is + * represented by an array (in the same way as binary heaps usually are), + * leaves correspond to input streams and each internal vertex remembers + * the leaf in its subtree, which has the lowest key. + */ + +typedef struct P(mwt) { + int i; // Minimum of the subtree +#ifdef SORT_UNIFY + int eq; // Did we encounter equality anywhere in the subtree? +#endif +} P(mwt); + +static inline void P(update_tree)(P(key) *keys, P(mwt) *tree, uns i) { while (i /= 2) { - if (tree[2*i] < 0) + if (tree[2*i].i < 0) tree[i] = tree[2*i+1]; - else if (tree[2*i+1] < 0) - tree[i] = tree[2*i]; - else if (P(compare)(&keys[tree[2*i]], &keys[tree[2*i+1]]) <= 0) + else if (tree[2*i+1].i < 0) tree[i] = tree[2*i]; else - tree[i] = tree[2*i+1]; + { + int cmp = P(compare)(&keys[tree[2*i].i], &keys[tree[2*i+1].i]); + tree[i] = (cmp <= 0) ? tree[2*i] : tree[2*i+1]; +#ifdef SORT_UNIFY + if (!cmp) + tree[i].eq = 1; +#endif + } + /* + * It is very tempting to stop as soon as the current node does not + * change, but it is wrong, because even if the stream index stored in + * the tree is the same, the actual key value can differ. + */ } + /* + * This function sometimes triggers optimizer bugs in GCC versions up to 4.2.1, + * leading to an assumption that tree[1] does not change during this function. + * We add an explicit memory barrier as a work-around. Ugh. See GCC Bug #33262. + */ + asm volatile ("" : : : "memory"); } -static void P(multiway_merge)(struct sort_context *ctx UNUSED, struct sort_bucket **ins, uns num_ins, struct sort_bucket *out) +static inline void P(set_tree)(P(key) *keys, P(mwt) *tree, uns i, int val) { + tree[i].i = val; + P(update_tree)(keys, tree, i); +} + +static void P(multiway_merge)(struct sort_context *ctx UNUSED, struct sort_bucket **ins, struct sort_bucket *out) +{ + uns num_ins = 0; + while (ins[num_ins]) + num_ins++; + uns n2 = 1; while (n2 < num_ins) n2 *= 2; struct fastbuf *fout = sbuck_write(out); struct fastbuf *fins[num_ins]; - P(key) keys[num_ins]; // FIXME: Tune num_ins according to L1 cache size - int tree[2*n2]; // A complete binary tree, leaves are input streams, each internal vertex contains a minimum of its sons - for (uns i=1; i<=n2; i++) - tree[i] = -1; + P(key) keys[num_ins]; + P(mwt) tree[2*n2]; + for (uns i=1; i<2*n2; i++) + tree[i] = (P(mwt)) { .i = -1 }; for (uns i=0; i= 0)) + { + int i = tree[1].i; + if (!tree[1].eq) { - tree[n2+i] = i; + /* The key is unique, so let's go through the fast path */ + P(copy_data)(&keys[i], fins[i], fout); + if (unlikely(!P(read_key)(fins[i], &keys[i]))) + tree[n2+i].i = -1; P(update_tree)(keys, tree, n2+i); + continue; + } + + uns m = 0; + key = &keys[i]; + do + { + hits[m] = i; + mkeys[m] = &keys[i]; + mfb[m] = fins[i]; + m++; + P(set_tree)(keys, tree, n2+i, -1); + i = tree[1].i; + if (unlikely(i < 0)) + break; + } + while (!P(compare)(key, &keys[i])); + + P(copy_merged)(mkeys, mfb, m, fout); + + for (uns j=0; j= 0) +#else + + /* Simplified version which does not support any unification */ + while (likely(tree[1].i >= 0)) { - uns i = tree[1]; + uns i = tree[1].i; + P(key) UNUSED key = keys[i]; P(copy_data)(&keys[i], fins[i], fout); if (unlikely(!P(read_key)(fins[i], &keys[i]))) - tree[n2+i] = -1; + tree[n2+i].i = -1; P(update_tree)(keys, tree, n2+i); - } - -#if 0 #ifdef SORT_ASSERT_UNIQUE - ASSERT(comp != 0); + ASSERT(tree[1].i < 0 || P(compare)(&key, &keys[tree[1].i]) < 0); #endif -#ifdef SORT_UNIFY - if (comp == 0) - { - P(key) *mkeys[] = { kin1, kin2 }; - struct fastbuf *mfb[] = { fin1, fin2 }; - P(copy_merged)(mkeys, mfb, 2, fout1); - SWAP(kin1, kprev1, ktmp); - next1 = P(read_key)(fin1, kin1); - run1 = next1 && (P(compare)(kprev1, kin1) LESS 0); - SWAP(kin2, kprev2, ktmp); - next2 = P(read_key)(fin2, kin2); - run2 = next2 && (P(compare)(kprev2, kin2) LESS 0); - kout = kprev2; } -#endif + #endif out->runs++;