Merge with git+ssh://cvs.ucw.cz/projects/sherlock/GIT/sherlock.git#v3.11

[libucw.git] / lib / sorter / s-multiway.h
diff --git a/lib/sorter/s-multiway.h b/lib/sorter/s-multiway.h

index a4352fe96c819401ff0bd7ce1ce150b7ee92d569..83e928f2621c6a8ba07adeb244b0b3ad63a5a5e3 100644 (file)
--- a/lib/sorter/s-multiway.h
+++ b/lib/sorter/s-multiway.h
@@ -7,72 +7,141 @@
   *     of the GNU Lesser General Public License.
   */
  
-static inline void P(update_tree)(P(key) *keys, int *tree, uns i)
+/*
+ * We use a binary tree to keep track of the current minimum. The tree is
+ * represented by an array (in the same way as binary heaps usually are),
+ * leaves correspond to input streams and each internal vertex remembers
+ * the leaf in its subtree, which has the lowest key.
+ */
+
+typedef struct P(mwt) {
+  int i;               // Minimum of the subtree
+#ifdef SORT_UNIFY
+  int eq;              // Did we encounter equality anywhere in the subtree?
+#endif
+} P(mwt);
+
+static inline void P(update_tree)(P(key) *keys, P(mwt) *tree, uns i)
  {
    while (i /= 2)
      {
-      if (tree[2*i] < 0)
+      if (tree[2*i].i < 0)
         tree[i] = tree[2*i+1];
-      else if (tree[2*i+1] < 0)
-       tree[i] = tree[2*i];
-      else if (P(compare)(&keys[tree[2*i]], &keys[tree[2*i+1]]) <= 0)
+      else if (tree[2*i+1].i < 0)
         tree[i] = tree[2*i];
        else
-       tree[i] = tree[2*i+1];
+       {
+         int cmp = P(compare)(&keys[tree[2*i].i], &keys[tree[2*i+1].i]);
+         tree[i] = (cmp <= 0) ? tree[2*i] : tree[2*i+1];
+#ifdef SORT_UNIFY
+         if (!cmp)
+           tree[i].eq = 1;
+#endif
+       }
+      /*
+       * It is very tempting to stop as soon as the current node does not
+       * change, but it is wrong, because even if the stream index stored in
+       * the tree is the same, the actual key value can differ.
+       */
      }
+  /*
+   * This function sometimes triggers optimizer bugs in GCC versions up to 4.2.1,
+   * leading to an assumption that tree[1] does not change during this function.
+   * We add an explicit memory barrier as a work-around. Ugh. See GCC Bug #33262.
+   */
+  asm volatile ("" : : : "memory");
  }
  
-static void P(multiway_merge)(struct sort_context *ctx UNUSED, struct sort_bucket **ins, uns num_ins, struct sort_bucket *out)
+static inline void P(set_tree)(P(key) *keys, P(mwt) *tree, uns i, int val)
  {
+  tree[i].i = val;
+  P(update_tree)(keys, tree, i);
+}
+
+static void P(multiway_merge)(struct sort_context *ctx UNUSED, struct sort_bucket **ins, struct sort_bucket *out)
+{
+  uns num_ins = 0;
+  while (ins[num_ins])
+    num_ins++;
+
    uns n2 = 1;
    while (n2 < num_ins)
      n2 *= 2;
  
    struct fastbuf *fout = sbuck_write(out);
    struct fastbuf *fins[num_ins];
-  P(key) keys[num_ins];                // FIXME: Tune num_ins according to L1 cache size
-  int tree[2*n2];              // A complete binary tree, leaves are input streams, each internal vertex contains a minimum of its sons
-  for (uns i=1; i<=n2; i++)
-    tree[i] = -1;
+  P(key) keys[num_ins];
+  P(mwt) tree[2*n2];
+  for (uns i=1; i<2*n2; i++)
+    tree[i] = (P(mwt)) { .i = -1 };
  
    for (uns i=0; i<num_ins; i++)
      {
        fins[i] = sbuck_read(ins[i]);
        if (P(read_key)(fins[i], &keys[i]))
+       P(set_tree)(keys, tree, n2+i, i);
+    }
+
+#ifdef SORT_UNIFY
+
+  uns hits[num_ins];
+  P(key) *mkeys[num_ins], *key;
+  struct fastbuf *mfb[num_ins];
+
+  while (likely(tree[1].i >= 0))
+    {
+      int i = tree[1].i;
+      if (!tree[1].eq)
         {
-         tree[n2+i] = i;
+         /* The key is unique, so let's go through the fast path */
+         P(copy_data)(&keys[i], fins[i], fout);
+         if (unlikely(!P(read_key)(fins[i], &keys[i])))
+           tree[n2+i].i = -1;
           P(update_tree)(keys, tree, n2+i);
+         continue;
+       }
+
+      uns m = 0;
+      key = &keys[i];
+      do
+       {
+         hits[m] = i;
+         mkeys[m] = &keys[i];
+         mfb[m] = fins[i];
+         m++;
+         P(set_tree)(keys, tree, n2+i, -1);
+         i = tree[1].i;
+         if (unlikely(i < 0))
+           break;
+       }
+      while (!P(compare)(key, &keys[i]));
+
+      P(copy_merged)(mkeys, mfb, m, fout);
+
+      for (uns j=0; j<m; j++)
+       {
+         i = hits[j];
+         if (likely(P(read_key)(fins[i], &keys[i])))
+           P(set_tree)(keys, tree, n2+i, i);
         }
      }
  
-  while (tree[1] >= 0)
+#else
+
+  /* Simplified version which does not support any unification */
+  while (likely(tree[1].i >= 0))
      {
-      uns i = tree[1];
+      uns i = tree[1].i;
+      P(key) UNUSED key = keys[i];
        P(copy_data)(&keys[i], fins[i], fout);
        if (unlikely(!P(read_key)(fins[i], &keys[i])))
-       tree[n2+i] = -1;
+       tree[n2+i].i = -1;
        P(update_tree)(keys, tree, n2+i);
-    }
-
-#if 0
  #ifdef SORT_ASSERT_UNIQUE
-  ASSERT(comp != 0);
+      ASSERT(tree[1].i < 0 || P(compare)(&key, &keys[tree[1].i]) < 0);
  #endif
-#ifdef SORT_UNIFY
-  if (comp == 0)
-    {
-      P(key) *mkeys[] = { kin1, kin2 };
-      struct fastbuf *mfb[] = { fin1, fin2 };
-      P(copy_merged)(mkeys, mfb, 2, fout1);
-      SWAP(kin1, kprev1, ktmp);
-      next1 = P(read_key)(fin1, kin1);
-      run1 = next1 && (P(compare)(kprev1, kin1) LESS 0);
-      SWAP(kin2, kprev2, ktmp);
-      next2 = P(read_key)(fin2, kin2);
-      run2 = next2 && (P(compare)(kprev2, kin2) LESS 0);
-      kout = kprev2;
      }
-#endif
+
  #endif
  
    out->runs++;