2 * Experiments with various sorting algorithms
4 * (c) 2007--2008 Martin Mares <mj@ucw.cz>
7 #include "sherlock/sherlock.h"
8 #include "ucw/getopt.h"
23 static struct elt *ary, *alt, **ind, *array0, *array1;
24 static uns n = 10000000;
27 static struct elt *alloc_elts(uns n)
29 return big_alloc(n * sizeof(struct elt));
32 static void free_elts(struct elt *a, uns n)
34 big_free(a, n * sizeof(struct elt));
37 static int comp(const void *x, const void *y)
39 const struct elt *xx = x, *yy = y;
40 return (xx->key < yy->key) ? -1 : (xx->key > yy->key) ? 1 : 0;
43 static int comp_ind(const void *x, const void *y)
45 const struct elt * const *xx = x, * const *yy = y;
46 return comp(*xx, *yy);
49 #define ASORT_PREFIX(x) as_##x
50 #define ASORT_KEY_TYPE u32
51 #define ASORT_ELT(i) a[i].key
52 #define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0)
53 #define ASORT_EXTRA_ARGS , struct elt *a
54 #include "ucw/sorter/array-simple.h"
56 #define ASORT_PREFIX(x) asi_##x
57 #define ASORT_KEY_TYPE u32
58 #define ASORT_ELT(i) ind[i]->key
59 #define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0)
60 #include "ucw/sorter/array-simple.h"
62 static void r1_sort(void)
64 struct elt *from = ary, *to = alt, *tmp;
67 for (uns sh=0; sh<32; sh+=BITS)
69 bzero(cnt, sizeof(cnt));
70 for (uns i=0; i<n; i++)
71 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
73 for (uns i=0; i<(1<<BITS); i++)
80 for (uns i=0; i<n; i++)
81 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
82 ASSERT(cnt[(1 << BITS)-1] == n);
83 tmp=from, from=to, to=tmp;
89 static void r1b_sort(void)
91 struct elt *from = ary, *to = alt, *tmp;
93 uns cnt[1 << BITS], cnt2[1 << BITS];
94 for (uns sh=0; sh<32; sh+=BITS)
97 memcpy(cnt, cnt2, sizeof(cnt));
100 bzero(cnt, sizeof(cnt));
101 for (uns i=0; i<n; i++)
102 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
105 for (uns i=0; i<(1<<BITS); i++)
112 bzero(cnt2, sizeof(cnt2));
113 for (uns i=0; i<n; i++)
115 cnt2[(from[i].key >> (sh + BITS)) & ((1 << BITS) - 1)]++;
116 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
118 ASSERT(cnt[(1 << BITS)-1] == n);
119 tmp=from, from=to, to=tmp;
125 static void r1c_sort(void)
128 struct elt *ptrs[256], *x, *lim;
130 x = ary; lim = ary + n;
131 bzero(cnt, sizeof(cnt));
133 cnt[x++->key & 255]++;
135 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
138 x = ary; lim = ary + n;
139 bzero(cnt, sizeof(cnt));
142 cnt[(x->key >> 8) & 255]++;
143 *ptrs[x->key & 255]++ = *x;
148 x = alt; lim = alt + n;
149 bzero(cnt, sizeof(cnt));
152 cnt[(x->key >> 16) & 255]++;
153 *ptrs[(x->key >> 8) & 255]++ = *x;
158 x = ary; lim = ary + n;
159 bzero(cnt, sizeof(cnt));
162 cnt[(x->key >> 24) & 255]++;
163 *ptrs[(x->key >> 16) & 255]++ = *x;
168 x = alt; lim = alt + n;
171 *ptrs[(x->key >> 24) & 255]++ = *x;
177 #include <emmintrin.h>
179 static inline void sse_copy_elt(struct elt *to, struct elt *from)
181 __m128i m = _mm_load_si128((__m128i *) from);
182 _mm_store_si128((__m128i *) to, m);
185 static void r1c_sse_sort(void)
188 struct elt *ptrs[256], *x, *lim;
190 ASSERT(sizeof(struct elt) == 16);
191 ASSERT(!((uintptr_t)alt & 15));
192 ASSERT(!((uintptr_t)ary & 15));
194 x = ary; lim = ary + n;
195 bzero(cnt, sizeof(cnt));
197 cnt[x++->key & 255]++;
199 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
202 x = ary; lim = ary + n;
203 bzero(cnt, sizeof(cnt));
206 cnt[(x->key >> 8) & 255]++;
207 sse_copy_elt(ptrs[x->key & 255]++, x);
212 x = alt; lim = alt + n;
213 bzero(cnt, sizeof(cnt));
216 cnt[(x->key >> 16) & 255]++;
217 sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x);
222 x = ary; lim = ary + n;
223 bzero(cnt, sizeof(cnt));
226 cnt[(x->key >> 24) & 255]++;
227 sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x);
232 x = alt; lim = alt + n;
235 sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x);
241 static void r1d_sort(void)
244 struct elt *ptrs[256], *x, *y, *lim;
248 x = ary; lim = ary + n;
249 bzero(cnt, sizeof(cnt));
252 cnt[x++->key & 255]++;
253 cnt[x++->key & 255]++;
254 cnt[x++->key & 255]++;
255 cnt[x++->key & 255]++;
258 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
261 x = ary; y = ary+n/2; lim = ary + n/2;
262 bzero(cnt, sizeof(cnt));
265 cnt[(x->key >> 8) & 255]++;
266 cnt[(y->key >> 8) & 255]++;
267 *ptrs[x->key & 255]++ = *x;
268 *ptrs[y->key & 255]++ = *y;
270 cnt[(x->key >> 8) & 255]++;
271 cnt[(y->key >> 8) & 255]++;
272 *ptrs[x->key & 255]++ = *x;
273 *ptrs[y->key & 255]++ = *y;
278 x = alt; lim = alt + n;
279 bzero(cnt, sizeof(cnt));
282 cnt[(x->key >> 16) & 255]++;
283 *ptrs[(x->key >> 8) & 255]++ = *x;
285 cnt[(x->key >> 16) & 255]++;
286 *ptrs[(x->key >> 8) & 255]++ = *x;
291 x = ary; lim = ary + n;
292 bzero(cnt, sizeof(cnt));
295 cnt[(x->key >> 24) & 255]++;
296 *ptrs[(x->key >> 16) & 255]++ = *x;
298 cnt[(x->key >> 24) & 255]++;
299 *ptrs[(x->key >> 16) & 255]++ = *x;
304 x = alt; lim = alt + n;
307 *ptrs[(x->key >> 24) & 255]++ = *x;
309 *ptrs[(x->key >> 24) & 255]++ = *x;
315 static void r2_sort(void)
317 struct elt *from = ary, *to = alt;
320 bzero(cnt, sizeof(cnt));
321 for (uns i=0; i<n; i++)
322 cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++;
324 for (uns i=0; i<(1<<BITS); i++)
331 for (uns i=0; i<n; i++)
332 to[cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i];
333 ASSERT(cnt[(1 << BITS)-1] == n);
336 for (uns i=0; i<(1 << BITS); i++)
338 as_sort(cnt[i] - pos, alt+pos);
345 static void r3_sort(void)
349 #define BUCKS (1 << BITS)
350 #define THRESHOLD 5000
353 auto void r3(struct elt *from, struct elt *to, uns n, uns lev);
354 void r3(struct elt *from, struct elt *to, uns n, uns lev)
356 uns sh = 32 - lev*BITS;
358 bzero(cnt, sizeof(cnt));
359 for (uns i=0; i<n; i++)
360 cnt[(from[i].key >> sh) & (BUCKS - 1)]++;
362 for (uns i=0; i<BUCKS; i++)
369 for (uns i=0; i<n; i++)
371 to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++] = from[i];
373 sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]);
376 for (uns i=0; i<BUCKS; i++)
379 if (lev >= LEVELS || l <= THRESHOLD)
382 if ((lev % 2) != ODDEVEN)
383 memcpy(from+pos, to+pos, l * sizeof(struct elt));
386 r3(to+pos, from+pos, l, lev+1);
402 static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z)
406 if (x->key <= y->key)
431 static void mergesort(void)
433 struct elt *from, *to;
437 struct elt *x = ary, *z = alt, *last = ary + (n & ~1U);
440 if (x[0].key < x[1].key)
441 *z++ = *x++, *z++ = *x++;
453 for (; (1U << lev) < n; lev++)
456 from = alt, to = ary;
458 from = ary, to = alt;
459 struct elt *x, *z, *last;
464 while (x + 2*step <= last)
466 z = mrg(x, x+step, x+step, x+2*step, z);
470 mrg(x, x+step, x+step, last, z);
472 memcpy(z, x, (byte*)last - (byte*)x);
478 static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
483 bzero(cnt, sizeof(cnt));
484 for (uns i=0; i<WAYS; i++)
485 k[i] = ar[random() % n];
487 for (uns i=0; i<n; i++)
490 #define FW(delta) if (ar[i].key > k[w+delta].key) w += delta
502 struct elt *y = al, *way[WAYS], *z;
503 for (uns i=0; i<WAYS; i++)
509 for (uns i=0; i<n; i++)
516 for (uns i=0; i<WAYS; i++)
519 sampsort(cnt[i], y, z, dest, wbuf);
524 memcpy(z, y, cnt[i]*sizeof(struct elt));
533 static void samplesort(void)
535 byte *aux = xmalloc(n);
536 sampsort(n, ary, alt, ary, aux);
540 static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
545 bzero(cnt, sizeof(cnt));
546 for (uns i=0; i<WAYS; i++)
547 k[i] = ar[random() % n];
549 struct elt *k1 = ar, *k2 = ar+1, *kend = ar+n;
554 #define FW1(delta) if (k1->key > k[w1+delta].key) w1 += delta
555 #define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta
574 FW1(128); FW1(64); FW1(32); FW1(16);
575 FW1(8); FW1(4); FW1(2); FW1(1);
579 struct elt *y = al, *way[WAYS], *z;
580 for (uns i=0; i<WAYS; i++)
586 for (uns i=0; i<n; i++)
593 for (uns i=0; i<WAYS; i++)
596 sampsort2(cnt[i], y, z, dest, wbuf);
601 memcpy(z, y, cnt[i]*sizeof(struct elt));
611 static void samplesort2(void)
613 byte *aux = xmalloc(n);
614 sampsort2(n, ary, alt, ary, aux);
618 static void heapsort(void)
620 #define H_LESS(_a,_b) ((_a).key > (_b).key)
621 struct elt *heap = ary-1;
622 HEAP_INIT(struct elt, heap, n, H_LESS, HEAP_SWAP);
625 HEAP_DELMIN(struct elt, heap, nn, H_LESS, HEAP_SWAP);
629 static void heapsort_ind(void)
631 #define H_LESS(_a,_b) ((_a)->key > (_b)->key)
632 struct elt **heap = ind-1;
633 HEAP_INIT(struct elt *, heap, n, H_LESS, HEAP_SWAP);
636 HEAP_DELMIN(struct elt *, heap, nn, H_LESS, HEAP_SWAP);
640 static void mk_ary(void)
647 bzero(block, sizeof(block));
650 for (uns i=0; i<n; i++)
656 md5_transform(ctx.buf, block);
658 ary[i].key = ctx.buf[i%4];
660 ary[i].key = i*(~0U/(n-1));
662 for (uns j=1; j<sizeof(struct elt)/4; j++)
663 ((u32*)&ary[i])[j] = ROL(ary[i].key, 3*j);
668 static void chk_ary(void)
671 for (uns i=1; i<n; i++)
672 if (ary[i].key < ary[i-1].key)
673 die("Missorted at %d", i);
680 static void mk_ind(void)
683 ind = xmalloc(sizeof(struct elt *) * n);
684 for (uns i=0; i<n; i++)
688 static void chk_ind(void)
691 for (uns i=1; i<n; i++)
692 if (ind[i]->key < ind[i-1]->key)
693 die("Missorted at %d", i);
701 int main(int argc, char **argv)
707 while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0)
711 op |= (1 << (opt - '0'));
717 array0 = alloc_elts(n);
718 array1 = alloc_elts(n);
719 for (uns i=0; i<n; i++)
720 array0[i] = array1[i] = (struct elt) { 0 };
722 log(L_INFO, "Testing with %u elements", n);
727 for (uns i=0; i<5; i++)
730 memcpy(alt, ary, sizeof(struct elt) * n);
731 memcpy(ary, alt, sizeof(struct elt) * n);
733 for (uns j=0; j<n; j++)
735 for (uns j=0; j<n; j++)
739 log(L_DEBUG, "memcpy: %d", get_timer(&timer)/10);
741 #define BENCH(type, name, func) mk_##type(); init_timer(&timer); func; log(L_DEBUG, name ": %d", get_timer(&timer)); chk_##type()
743 BENCH(ary, "qsort", qsort(ary, n, sizeof(struct elt), comp));
744 BENCH(ary, "arraysort", as_sort(n, ary));
745 BENCH(ind, "indirect qsort", qsort(ind, n, sizeof(struct elt *), comp_ind));
746 BENCH(ind, "indirect arraysort", asi_sort(n));
747 BENCH(ary, "radix1", r1_sort());
748 BENCH(ary, "radix1b", r1b_sort());
749 BENCH(ary, "radix1c", r1c_sort());
750 BENCH(ary, "radix1c-sse", r1c_sse_sort());
751 BENCH(ary, "radix1d", r1d_sort());
752 BENCH(ary, "radix2", r2_sort());
753 BENCH(ary, "radix3", r3_sort());
754 BENCH(ary, "mergesort", mergesort());
755 BENCH(ary, "samplesort", samplesort());
756 BENCH(ary, "samplesort2", samplesort2());
757 BENCH(ary, "heapsort", heapsort());
758 BENCH(ind, "indirect heapsort", heapsort_ind());
760 free_elts(array0, n);
761 free_elts(array1, n);