2 * Experiments with various sorting algorithms
4 * (c) 2007--2008 Martin Mares <mj@ucw.cz>
8 #include <ucw/getopt.h>
/*
 * Shared state of the benchmark:
 *   ary            - the array the direct sorts operate on
 *   alt            - a second array the sorts use as the other side of their copies
 *   ind            - array of element pointers used by the indirect sorts
 *   array0, array1 - the two buffers allocated and released in main()
 *                    (how ary/alt are bound to them is not visible here)
 */
24 static struct elt *ary, *alt, **ind, *array0, *array1;
/* Number of elements in each array; this is the initial value. */
25 static uns n = 10000000;
28 static struct elt *alloc_elts(uns n)
30 return big_alloc(n * sizeof(struct elt));
33 static void free_elts(struct elt *a, uns n)
35 big_free(a, n * sizeof(struct elt));
38 static int comp(const void *x, const void *y)
40 const struct elt *xx = x, *yy = y;
41 return (xx->key < yy->key) ? -1 : (xx->key > yy->key) ? 1 : 0;
/*
 * qsort()-style comparator for an array of element pointers:
 * dereferences both slots and defers to comp() on the pointed-to elements.
 */
static int comp_ind(const void *x, const void *y)
{
  const struct elt *left = *(const struct elt * const *) x;
  const struct elt *right = *(const struct elt * const *) y;

  return comp(left, right);
}
/*
 * Instantiate the generic array sorter from <ucw/sorter/array-simple.h>
 * with the as_ prefix (used below as as_sort(count, array)).
 * Elements are compared by their u32 key and swapped as whole structures;
 * the array to sort is passed in through the extra argument `a`.
 */
50 #define ASORT_PREFIX(x) as_##x
51 #define ASORT_KEY_TYPE u32
52 #define ASORT_ELT(i) a[i].key
53 #define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0)
54 #define ASORT_EXTRA_ARGS , struct elt *a
55 #include <ucw/sorter/array-simple.h>
/*
 * Second instantiation of the array sorter, with the asi_ prefix
 * (used below as asi_sort(count)). It works on the global pointer array
 * `ind`: keys are read through the pointers and only the pointers are swapped.
 */
57 #define ASORT_PREFIX(x) asi_##x
58 #define ASORT_KEY_TYPE u32
59 #define ASORT_ELT(i) ind[i]->key
60 #define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0)
61 #include <ucw/sorter/array-simple.h>
/*
 * Radix sort of ary by key, least significant digit first: one pass per
 * BITS-wide digit for shifts 0, BITS, ... below 32. Every pass copies all
 * elements from `from` to `to`, then the two roles are exchanged.
 */
63 static void r1_sort(void)
65 struct elt *from = ary, *to = alt, *tmp;
68 for (uns sh=0; sh<32; sh+=BITS)
/* Histogram of the current digit. */
70 bzero(cnt, sizeof(cnt));
71 for (uns i=0; i<n; i++)
72 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
/* Loop over all buckets; its body is not visible here - presumably it converts counts to start offsets. */
74 for (uns i=0; i<(1<<BITS); i++)
/* Scatter: each element goes to the next free slot of its bucket. */
81 for (uns i=0; i<n; i++)
82 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
/* Sanity check: the cursor of the last bucket must have reached n. */
83 ASSERT(cnt[(1 << BITS)-1] == n);
/* Exchange source and destination for the next digit. */
84 tmp=from, from=to, to=tmp;
/*
 * Variant of r1_sort() which gathers the histogram of the NEXT digit (cnt2)
 * during the scatter loop of the current one, so that a later pass can take
 * its counts from cnt2 instead of scanning the data again.
 */
90 static void r1b_sort(void)
92 struct elt *from = ary, *to = alt, *tmp;
94 uns cnt[1 << BITS], cnt2[1 << BITS];
95 for (uns sh=0; sh<32; sh+=BITS)
/* Reuse the counts collected in the previous pass (the condition selecting this path is not visible here)... */
98 memcpy(cnt, cnt2, sizeof(cnt));
/* ...or count the current digit from scratch. */
101 bzero(cnt, sizeof(cnt));
102 for (uns i=0; i<n; i++)
103 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
/* Loop over all buckets; body not visible here - presumably counts become start offsets. */
106 for (uns i=0; i<(1<<BITS); i++)
113 bzero(cnt2, sizeof(cnt2));
114 for (uns i=0; i<n; i++)
/*
 * Count the next digit while scattering by the current one.
 * NOTE(review): in the last pass sh + BITS reaches at least 32; if key is a
 * 32-bit type, that shift count is undefined behavior in C - confirm whether
 * a guard exists on a line not visible here.
 */
116 cnt2[(from[i].key >> (sh + BITS)) & ((1 << BITS) - 1)]++;
117 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
/* Sanity check: the cursor of the last bucket must have reached n. */
119 ASSERT(cnt[(1 << BITS)-1] == n);
120 tmp=from, from=to, to=tmp;
/*
 * Radix sort with a fixed 8-bit digit (mask 255, shifts 0/8/16/24), walking
 * the data with pointers instead of indices. While elements are scattered by
 * one byte, the histogram of the following byte is gathered in the same sweep.
 * The sweeps read ary, alt, ary, alt in turn; the PTRS() invocations that
 * select the write side and the loop headers are not visible here.
 */
126 static void r1c_sort(void)
129 struct elt *ptrs[256], *x, *lim;
/* Initial histogram of the lowest byte over ary. */
131 x = ary; lim = ary + n;
132 bzero(cnt, sizeof(cnt));
134 cnt[x++->key & 255]++;
/* PTRS(start): point ptrs[i] at the first slot of bucket i inside the array beginning at `start`, using cnt[]; overwrites x. */
136 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
/* Sweep over ary: count byte 1, scatter by byte 0. */
139 x = ary; lim = ary + n;
140 bzero(cnt, sizeof(cnt));
143 cnt[(x->key >> 8) & 255]++;
144 *ptrs[x->key & 255]++ = *x;
/* Sweep over alt: count byte 2, scatter by byte 1. */
149 x = alt; lim = alt + n;
150 bzero(cnt, sizeof(cnt));
153 cnt[(x->key >> 16) & 255]++;
154 *ptrs[(x->key >> 8) & 255]++ = *x;
/* Sweep over ary: count byte 3, scatter by byte 2. */
159 x = ary; lim = ary + n;
160 bzero(cnt, sizeof(cnt));
163 cnt[(x->key >> 24) & 255]++;
164 *ptrs[(x->key >> 16) & 255]++ = *x;
/* Final sweep over alt: scatter by the top byte, nothing left to count. */
169 x = alt; lim = alt + n;
172 *ptrs[(x->key >> 24) & 255]++ = *x;
178 #include <emmintrin.h>
/*
 * Copy one element using a single 128-bit SSE2 load and store.
 * Both intrinsics are the aligned variants, so `from` and `to` have to be
 * 16-byte aligned, and exactly 16 bytes are transferred.
 */
static inline void sse_copy_elt(struct elt *to, struct elt *from)
{
  _mm_store_si128((__m128i *) to, _mm_load_si128((__m128i *) from));
}
/*
 * Same scheme as r1c_sort() (fixed 8-bit digit, next histogram gathered
 * during the scatter), but every element is moved by sse_copy_elt()
 * instead of a structure assignment.
 */
186 static void r1c_sse_sort(void)
189 struct elt *ptrs[256], *x, *lim;
/* Preconditions of the 128-bit aligned copy: 16-byte elements and 16-byte aligned arrays. */
191 ASSERT(sizeof(struct elt) == 16);
192 ASSERT(!((uintptr_t)alt & 15));
193 ASSERT(!((uintptr_t)ary & 15));
/* Initial histogram of the lowest byte over ary. */
195 x = ary; lim = ary + n;
196 bzero(cnt, sizeof(cnt));
198 cnt[x++->key & 255]++;
/* PTRS(start): point ptrs[i] at the first slot of bucket i inside the array beginning at `start`, using cnt[]; overwrites x. */
200 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
/* Sweep over ary: count byte 1, scatter by byte 0. */
203 x = ary; lim = ary + n;
204 bzero(cnt, sizeof(cnt));
207 cnt[(x->key >> 8) & 255]++;
208 sse_copy_elt(ptrs[x->key & 255]++, x);
/* Sweep over alt: count byte 2, scatter by byte 1. */
213 x = alt; lim = alt + n;
214 bzero(cnt, sizeof(cnt));
217 cnt[(x->key >> 16) & 255]++;
218 sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x);
/* Sweep over ary: count byte 3, scatter by byte 2. */
223 x = ary; lim = ary + n;
224 bzero(cnt, sizeof(cnt));
227 cnt[(x->key >> 24) & 255]++;
228 sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x);
/* Final sweep over alt: scatter by the top byte. */
233 x = alt; lim = alt + n;
236 sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x);
/*
 * Another take on r1c_sort() in which the statements of each sweep are
 * written out several times in a row (the enclosing loop headers and the
 * pointer increments between the copies are not visible here). The first
 * scatter sweep additionally uses a second read cursor y.
 */
242 static void r1d_sort(void)
245 struct elt *ptrs[256], *x, *y, *lim;
/* Initial histogram of the lowest byte; four elements are consumed per repetition. */
/* NOTE(review): presumably relies on n being a multiple of the repetition count - confirm against the loop condition. */
249 x = ary; lim = ary + n;
250 bzero(cnt, sizeof(cnt));
253 cnt[x++->key & 255]++;
254 cnt[x++->key & 255]++;
255 cnt[x++->key & 255]++;
256 cnt[x++->key & 255]++;
/* PTRS(start): point ptrs[i] at the first slot of bucket i inside the array beginning at `start`, using cnt[]; overwrites x. */
259 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
/* Sweep with two cursors: x starts at ary and stops at ary + n/2, y starts at ary + n/2. Count byte 1, scatter by byte 0. */
262 x = ary; y = ary+n/2; lim = ary + n/2;
263 bzero(cnt, sizeof(cnt));
266 cnt[(x->key >> 8) & 255]++;
267 cnt[(y->key >> 8) & 255]++;
268 *ptrs[x->key & 255]++ = *x;
269 *ptrs[y->key & 255]++ = *y;
271 cnt[(x->key >> 8) & 255]++;
272 cnt[(y->key >> 8) & 255]++;
273 *ptrs[x->key & 255]++ = *x;
274 *ptrs[y->key & 255]++ = *y;
/* Sweep over alt: count byte 2, scatter by byte 1. */
279 x = alt; lim = alt + n;
280 bzero(cnt, sizeof(cnt));
283 cnt[(x->key >> 16) & 255]++;
284 *ptrs[(x->key >> 8) & 255]++ = *x;
286 cnt[(x->key >> 16) & 255]++;
287 *ptrs[(x->key >> 8) & 255]++ = *x;
/* Sweep over ary: count byte 3, scatter by byte 2. */
292 x = ary; lim = ary + n;
293 bzero(cnt, sizeof(cnt));
296 cnt[(x->key >> 24) & 255]++;
297 *ptrs[(x->key >> 16) & 255]++ = *x;
299 cnt[(x->key >> 24) & 255]++;
300 *ptrs[(x->key >> 16) & 255]++ = *x;
/* Final sweep over alt: scatter by the top byte. */
305 x = alt; lim = alt + n;
308 *ptrs[(x->key >> 24) & 255]++ = *x;
310 *ptrs[(x->key >> 24) & 255]++ = *x;
/*
 * Hybrid sort: a single distribution pass on the top BITS bits of the key
 * moves ary into buckets in alt, then every bucket is sorted separately
 * by as_sort(). How the result is made visible in ary is not shown here.
 */
316 static void r2_sort(void)
318 struct elt *from = ary, *to = alt;
/* Histogram of the most significant digit. */
321 bzero(cnt, sizeof(cnt));
322 for (uns i=0; i<n; i++)
323 cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++;
/* Loop over all buckets; body not visible here - presumably counts become start offsets. */
325 for (uns i=0; i<(1<<BITS); i++)
/* Scatter every element into its bucket. */
332 for (uns i=0; i<n; i++)
333 to[cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i];
/* Sanity check: the cursor of the last bucket must have reached n. */
334 ASSERT(cnt[(1 << BITS)-1] == n);
/* After the scatter cnt[i] is the end of bucket i; sort the range from pos up to that end. */
337 for (uns i=0; i<(1 << BITS); i++)
339 as_sort(cnt[i] - pos, alt+pos);
/*
 * Recursive radix sort working from the most significant digit downwards.
 * The work is done by r3(), declared with `auto` inside this function,
 * i.e. a nested function (a GNU C extension).
 */
346 static void r3_sort(void)
/* Number of buckets per level and the bucket size at or below which recursion stops. */
350 #define BUCKS (1 << BITS)
351 #define THRESHOLD 5000
/*
 * r3(from, to, n, lev): distribute the n elements of `from` into `to` by the
 * lev-th digit counted from the top (lev is expected to start at 1, given
 * the shift computed below), then handle each bucket.
 */
354 auto void r3(struct elt *from, struct elt *to, uns n, uns lev);
355 void r3(struct elt *from, struct elt *to, uns n, uns lev)
357 uns sh = 32 - lev*BITS;
/* Histogram of this level's digit. */
359 bzero(cnt, sizeof(cnt));
360 for (uns i=0; i<n; i++)
361 cnt[(from[i].key >> sh) & (BUCKS - 1)]++;
/* Loop over all buckets; body not visible here - presumably counts become start offsets. */
363 for (uns i=0; i<BUCKS; i++)
370 for (uns i=0; i<n; i++)
/* Two alternative ways of moving an element (plain assignment or SSE copy); presumably chosen by conditional compilation not visible here. */
372 to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++] = from[i];
374 sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]);
377 for (uns i=0; i<BUCKS; i++)
/* Recursion ends at the last level or for buckets no larger than THRESHOLD; what is done to such a bucket first is not visible here. */
380 if (lev >= LEVELS || l <= THRESHOLD)
/* Depending on the parity of the level, the bucket is copied back from `to` into `from`. */
383 if ((lev % 2) != ODDEVEN)
384 memcpy(from+pos, to+pos, l * sizeof(struct elt));
/* Otherwise descend one level with the roles of the two arrays exchanged. */
387 r3(to+pos, from+pos, l, lev+1);
/*
 * Merge step used by mergesort(): takes two input runs delimited by
 * (x, xl) and (y, yl) and an output position z. Most of the body is not
 * visible here; mergesort() assigns the return value back to its output
 * cursor, so it is presumably the advanced z.
 */
403 static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z)
/* On equal keys the element of the first run is preferred. */
407 if (x->key <= y->key)
/*
 * Bottom-up merge sort between ary and alt. An initial pass processes ary
 * in pairs of neighbours and writes them to alt; further passes merge runs
 * of growing length, switching the direction of copying between the two
 * arrays. Several conditions and assignments are not visible here.
 */
432 static void mergesort(void)
434 struct elt *from, *to;
/* `last` is the end of the last complete pair (n rounded down to an even count). */
438 struct elt *x = ary, *z = alt, *last = ary + (n & ~1U);
/* A pair which is already in order is copied as it is; the other case is not visible here. */
441 if (x[0].key < x[1].key)
442 *z++ = *x++, *z++ = *x++;
/* One pass per level for as long as 2^lev is below n. */
454 for (; (1U << lev) < n; lev++)
/* Two alternative directions of copying (the selecting condition is not visible here). */
457 from = alt, to = ary;
459 from = ary, to = alt;
460 struct elt *x, *z, *last;
/* Merge all complete pairs of runs of length `step`. */
465 while (x + 2*step <= last)
467 z = mrg(x, x+step, x+step, x+2*step, z);
/* Tail: either a full run followed by a shorter one... */
471 mrg(x, x+step, x+step, last, z);
/* ...or a remainder which is copied unchanged. */
473 memcpy(z, x, (byte*)last - (byte*)x);
/*
 * Recursive worker of samplesort(). Takes n elements in `ar`, a second
 * array `al`, a destination `dest` and a byte buffer `wbuf`. WAYS elements
 * picked at random from the input serve as the keys that elements are
 * compared against. The parts not visible here include the handling of the
 * sample after it is drawn and the use of wbuf.
 */
479 static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
484 bzero(cnt, sizeof(cnt));
/* Draw the sample. NOTE(review): random() % n requires n != 0 - confirm that callers never pass an empty range. */
485 for (uns i=0; i<WAYS; i++)
486 k[i] = ar[random() % n];
488 for (uns i=0; i<n; i++)
/* FW(delta): one step of a branching search - move w forward by delta when the element's key exceeds the sample key at w+delta. */
491 #define FW(delta) if (ar[i].key > k[w+delta].key) w += delta
503 struct elt *y = al, *way[WAYS], *z;
504 for (uns i=0; i<WAYS; i++)
510 for (uns i=0; i<n; i++)
/* Per way: either recurse on it (visible: cnt[i] elements at y, with z as the other array)... */
517 for (uns i=0; i<WAYS; i++)
520 sampsort(cnt[i], y, z, dest, wbuf);
/* ...or copy its cnt[i] elements from y to z (the deciding condition is not visible here). */
525 memcpy(z, y, cnt[i]*sizeof(struct elt));
/*
 * Entry point of the sample sort benchmark: allocates an n-byte auxiliary
 * buffer and runs sampsort() on ary, with alt as the second array and ary
 * as the destination. The end of the function is not visible here.
 */
534 static void samplesort(void)
536 byte *aux = xmalloc(n);
537 sampsort(n, ary, alt, ary, aux);
/*
 * Variant of sampsort() whose classification step walks the input with two
 * element pointers (k1, k2) and has a separate search macro for each.
 * Loop headers and several statements are not visible here.
 */
541 static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
546 bzero(cnt, sizeof(cnt));
/* Draw the sample. NOTE(review): random() % n requires n != 0 - confirm that callers never pass an empty range. */
547 for (uns i=0; i<WAYS; i++)
548 k[i] = ar[random() % n];
/* k1 and k2 start on the first two elements; kend marks the end of the input. */
550 struct elt *k1 = ar, *k2 = ar+1, *kend = ar+n;
/* FW1/FW2(delta): one search step for the element at k1 / k2 - advance its way index by delta when its key exceeds the sample key there. */
555 #define FW1(delta) if (k1->key > k[w1+delta].key) w1 += delta
556 #define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta
/* A search for k1 alone, with steps 128 down to 1; presumably this handles an element left without a partner. */
575 FW1(128); FW1(64); FW1(32); FW1(16);
576 FW1(8); FW1(4); FW1(2); FW1(1);
580 struct elt *y = al, *way[WAYS], *z;
581 for (uns i=0; i<WAYS; i++)
587 for (uns i=0; i<n; i++)
/* Per way: either recurse on it... */
594 for (uns i=0; i<WAYS; i++)
597 sampsort2(cnt[i], y, z, dest, wbuf);
/* ...or copy its cnt[i] elements from y to z (the deciding condition is not visible here). */
602 memcpy(z, y, cnt[i]*sizeof(struct elt));
/*
 * Entry point of the second sample sort benchmark: allocates an n-byte
 * auxiliary buffer and runs sampsort2() on ary, with alt as the second
 * array and ary as the destination. The end of the function is not visible here.
 */
612 static void samplesort2(void)
614 byte *aux = xmalloc(n);
615 sampsort2(n, ary, alt, ary, aux);
/*
 * Heap sort of ary built on the HEAP_* macros. The loop around the
 * deletion step and the definition of nn are not visible here.
 */
619 static void heapsort(void)
/* The "less" relation is reversed on purpose, so the element with the largest key is the heap minimum. */
621 #define H_LESS(_a,_b) ((_a).key > (_b).key)
/*
 * The heap base is one element before ary, presumably because the HEAP_*
 * macros index from 1. NOTE(review): forming a pointer before the start of
 * an array is not strictly valid C - confirm this is acceptable here.
 */
622 struct elt *heap = ary-1;
623 HEAP_INIT(struct elt, heap, n, H_LESS, HEAP_SWAP);
/* Remove the current heap minimum (the largest key); nn is the current heap size. */
626 HEAP_DELETE_MIN(struct elt, heap, nn, H_LESS, HEAP_SWAP);
/*
 * Heap sort of the pointer array `ind`: the same scheme as heapsort(), but
 * the heap holds element pointers and keys are read through them.
 * The loop around the deletion step is not visible here.
 */
630 static void heapsort_ind(void)
/* Reversed comparison, so the pointer to the largest key is the heap minimum. */
632 #define H_LESS(_a,_b) ((_a)->key > (_b)->key)
/* Base shifted one slot before ind, presumably for macros indexing from 1. NOTE(review): pointer before the array start - see heapsort(). */
633 struct elt **heap = ind-1;
634 HEAP_INIT(struct elt *, heap, n, H_LESS, HEAP_SWAP);
637 HEAP_DELETE_MIN(struct elt *, heap, nn, H_LESS, HEAP_SWAP);
/*
 * Fill ary with test data. Two ways of producing a key are visible: words
 * of an MD5 state which is transformed repeatedly, and an evenly spaced
 * increasing sequence. How one of them is selected is not visible here.
 */
641 static void mk_ary(void)
648 bzero(block, sizeof(block));
651 for (uns i=0; i<n; i++)
/* Pseudo-random keys: transform the MD5 state and take one of its four words. */
657 md5_transform(ctx.buf, block);
659 ary[i].key = ctx.buf[i%4];
/* Increasing keys spread over the unsigned range. NOTE(review): divides by n-1, so n == 1 would divide by zero if this path is active. */
661 ary[i].key = i*(~0U/(n-1));
/* Fill the remaining 32-bit words of the element with rotations of the key (assumes the key is the first word - TODO confirm against struct elt). */
663 for (uns j=1; j<sizeof(struct elt)/4; j++)
664 ((u32*)&ary[i])[j] = ROL(ary[i].key, 3*j);
/*
 * Verify that ary is sorted by key in non-decreasing order and die() with
 * the offending index otherwise. Lines around the loop are not visible here.
 */
669 static void chk_ary(void)
672 for (uns i=1; i<n; i++)
673 if (ary[i].key < ary[i-1].key)
/* NOTE(review): i is unsigned (uns) but is printed with %d; %u would match the argument type. */
674 die("Missorted at %d", i);
/*
 * Prepare the pointer array for the indirect sorts: allocates room for n
 * element pointers and loops over all of them. The loop body is not visible
 * here - presumably ind[i] is pointed at ary[i].
 * NOTE(review): a new array is allocated on every call; confirm that the
 * previous one is released somewhere.
 */
681 static void mk_ind(void)
684 ind = xmalloc(sizeof(struct elt *) * n);
685 for (uns i=0; i<n; i++)
/*
 * Verify that the elements referenced by ind are in non-decreasing key
 * order and die() with the offending index otherwise.
 */
689 static void chk_ind(void)
692 for (uns i=1; i<n; i++)
693 if (ind[i]->key < ind[i-1]->key)
/* NOTE(review): i is unsigned (uns) but is printed with %d; %u would match the argument type. */
694 die("Missorted at %d", i);
/*
 * Benchmark driver: parses options, allocates the two element arrays,
 * measures plain memcpy() between them and then times each sorting routine,
 * checking its output afterwards. Many statements (including whatever may
 * select individual benchmarks) are not visible here.
 */
702 int main(int argc, char **argv)
/* Option parsing through the libucw configuration getopt wrapper. */
708 while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0)
/* A digit option sets the bit of the same number in `op`. */
712 op |= (1 << (opt - '0'));
/* Allocate both arrays and store a zeroed element into every slot. */
718 array0 = alloc_elts(n);
719 array1 = alloc_elts(n);
720 for (uns i=0; i<n; i++)
721 array0[i] = array1[i] = (struct elt) { 0 };
723 msg(L_INFO, "Testing with %u elements", n);
/* Baseline: five rounds of copying the data in both directions; the timer reading is divided by 10 for the report. */
728 for (uns i=0; i<5; i++)
731 memcpy(alt, ary, sizeof(struct elt) * n);
732 memcpy(ary, alt, sizeof(struct elt) * n);
734 for (uns j=0; j<n; j++)
736 for (uns j=0; j<n; j++)
740 msg(L_DEBUG, "memcpy: %d", get_timer(&timer)/10);
/* BENCH(type, name, func): build the input with mk_<type>(), time the execution of func, log the reading and verify with chk_<type>(). */
742 #define BENCH(type, name, func) mk_##type(); init_timer(&timer); func; msg(L_DEBUG, name ": %d", get_timer(&timer)); chk_##type()
744 BENCH(ary, "qsort", qsort(ary, n, sizeof(struct elt), comp));
745 BENCH(ary, "arraysort", as_sort(n, ary));
746 BENCH(ind, "indirect qsort", qsort(ind, n, sizeof(struct elt *), comp_ind));
747 BENCH(ind, "indirect arraysort", asi_sort(n));
748 BENCH(ary, "radix1", r1_sort());
749 BENCH(ary, "radix1b", r1b_sort());
750 BENCH(ary, "radix1c", r1c_sort());
751 BENCH(ary, "radix1c-sse", r1c_sse_sort());
752 BENCH(ary, "radix1d", r1d_sort());
753 BENCH(ary, "radix2", r2_sort());
754 BENCH(ary, "radix3", r3_sort());
755 BENCH(ary, "mergesort", mergesort());
756 BENCH(ary, "samplesort", samplesort());
757 BENCH(ary, "samplesort2", samplesort2());
758 BENCH(ary, "heapsort", heapsort());
759 BENCH(ind, "indirect heapsort", heapsort_ind());
/* Release both arrays. */
761 free_elts(array0, n);
762 free_elts(array1, n);