2 * An experiment with sorting algorithms
5 #include "sherlock/sherlock.h"
6 #include "lib/getopt.h"
20 static struct elt *ary, *alt, **ind, *array0, *array1;
21 static uns n = 10000000;
24 static struct elt *alloc_elts(uns n)
27 return xmalloc(n * sizeof(struct elt));
29 uns len = ALIGN_TO(n * sizeof(struct elt), PAGE_SIZE);
30 void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
31 ASSERT(p != MAP_FAILED);
36 static void free_elts(struct elt *a, uns n)
42 uns len = ALIGN_TO(n * sizeof(struct elt), PAGE_SIZE);
47 static int comp(const void *x, const void *y)
49 const struct elt *xx = x, *yy = y;
50 return (xx->key < yy->key) ? -1 : (xx->key > yy->key) ? 1 : 0;
53 static int comp_ind(const void *x, const void *y)
55 const struct elt * const *xx = x, * const *yy = y;
56 return comp(*xx, *yy);
59 #define ASORT_PREFIX(x) as_##x
60 #define ASORT_KEY_TYPE u32
61 #define ASORT_ELT(i) a[i].key
62 #define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0)
63 #define ASORT_EXTRA_ARGS , struct elt *a
64 #include "lib/arraysort.h"
66 #define ASORT_PREFIX(x) asi_##x
67 #define ASORT_KEY_TYPE u32
68 #define ASORT_ELT(i) ind[i]->key
69 #define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0)
70 #include "lib/arraysort.h"
72 static void r1_sort(void)
74 struct elt *from = ary, *to = alt, *tmp;
77 for (uns sh=0; sh<32; sh+=BITS)
79 bzero(cnt, sizeof(cnt));
80 for (uns i=0; i<n; i++)
81 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
83 for (uns i=0; i<(1<<BITS); i++)
90 for (uns i=0; i<n; i++)
91 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
92 ASSERT(cnt[(1 << BITS)-1] == n);
93 tmp=from, from=to, to=tmp;
99 static void r1b_sort(void)
101 struct elt *from = ary, *to = alt, *tmp;
103 uns cnt[1 << BITS], cnt2[1 << BITS];
104 for (uns sh=0; sh<32; sh+=BITS)
107 memcpy(cnt, cnt2, sizeof(cnt));
110 bzero(cnt, sizeof(cnt));
111 for (uns i=0; i<n; i++)
112 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
115 for (uns i=0; i<(1<<BITS); i++)
122 bzero(cnt2, sizeof(cnt2));
123 for (uns i=0; i<n; i++)
125 cnt2[(from[i].key >> (sh + BITS)) & ((1 << BITS) - 1)]++;
126 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
128 ASSERT(cnt[(1 << BITS)-1] == n);
129 tmp=from, from=to, to=tmp;
135 static void r1c_sort(void)
138 struct elt *ptrs[256], *x, *lim;
140 x = ary; lim = ary + n;
141 bzero(cnt, sizeof(cnt));
143 cnt[x++->key & 255]++;
145 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
148 x = ary; lim = ary + n;
149 bzero(cnt, sizeof(cnt));
152 cnt[(x->key >> 8) & 255]++;
153 *ptrs[x->key & 255]++ = *x;
158 x = alt; lim = alt + n;
159 bzero(cnt, sizeof(cnt));
162 cnt[(x->key >> 16) & 255]++;
163 *ptrs[(x->key >> 8) & 255]++ = *x;
168 x = ary; lim = ary + n;
169 bzero(cnt, sizeof(cnt));
172 cnt[(x->key >> 24) & 255]++;
173 *ptrs[(x->key >> 16) & 255]++ = *x;
178 x = alt; lim = alt + n;
181 *ptrs[(x->key >> 24) & 255]++ = *x;
187 #include <emmintrin.h>
189 static inline void sse_copy_elt(struct elt *to, struct elt *from)
191 __m128i m = _mm_load_si128((__m128i *) from);
192 _mm_store_si128((__m128i *) to, m);
195 static void r1c_sse_sort(void)
198 struct elt *ptrs[256], *x, *lim;
200 ASSERT(sizeof(struct elt) == 16);
201 ASSERT(!((addr_int_t)alt & 15));
202 ASSERT(!((addr_int_t)ary & 15));
204 x = ary; lim = ary + n;
205 bzero(cnt, sizeof(cnt));
207 cnt[x++->key & 255]++;
209 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
212 x = ary; lim = ary + n;
213 bzero(cnt, sizeof(cnt));
216 cnt[(x->key >> 8) & 255]++;
217 sse_copy_elt(ptrs[x->key & 255]++, x);
222 x = alt; lim = alt + n;
223 bzero(cnt, sizeof(cnt));
226 cnt[(x->key >> 16) & 255]++;
227 sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x);
232 x = ary; lim = ary + n;
233 bzero(cnt, sizeof(cnt));
236 cnt[(x->key >> 24) & 255]++;
237 sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x);
242 x = alt; lim = alt + n;
245 sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x);
251 static void r1d_sort(void)
254 struct elt *ptrs[256], *x, *y, *lim;
258 x = ary; lim = ary + n;
259 bzero(cnt, sizeof(cnt));
262 cnt[x++->key & 255]++;
263 cnt[x++->key & 255]++;
264 cnt[x++->key & 255]++;
265 cnt[x++->key & 255]++;
268 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
271 x = ary; y = ary+n/2; lim = ary + n/2;
272 bzero(cnt, sizeof(cnt));
275 cnt[(x->key >> 8) & 255]++;
276 cnt[(y->key >> 8) & 255]++;
277 *ptrs[x->key & 255]++ = *x;
278 *ptrs[y->key & 255]++ = *y;
280 cnt[(x->key >> 8) & 255]++;
281 cnt[(y->key >> 8) & 255]++;
282 *ptrs[x->key & 255]++ = *x;
283 *ptrs[y->key & 255]++ = *y;
288 x = alt; lim = alt + n;
289 bzero(cnt, sizeof(cnt));
292 cnt[(x->key >> 16) & 255]++;
293 *ptrs[(x->key >> 8) & 255]++ = *x;
295 cnt[(x->key >> 16) & 255]++;
296 *ptrs[(x->key >> 8) & 255]++ = *x;
301 x = ary; lim = ary + n;
302 bzero(cnt, sizeof(cnt));
305 cnt[(x->key >> 24) & 255]++;
306 *ptrs[(x->key >> 16) & 255]++ = *x;
308 cnt[(x->key >> 24) & 255]++;
309 *ptrs[(x->key >> 16) & 255]++ = *x;
314 x = alt; lim = alt + n;
317 *ptrs[(x->key >> 24) & 255]++ = *x;
319 *ptrs[(x->key >> 24) & 255]++ = *x;
325 static void r2_sort(void)
327 struct elt *from = ary, *to = alt;
330 bzero(cnt, sizeof(cnt));
331 for (uns i=0; i<n; i++)
332 cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++;
334 for (uns i=0; i<(1<<BITS); i++)
341 for (uns i=0; i<n; i++)
342 to[cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i];
343 ASSERT(cnt[(1 << BITS)-1] == n);
346 for (uns i=0; i<(1 << BITS); i++)
348 as_sort(cnt[i] - pos, alt+pos);
355 static void r3_sort(void)
359 #define BUCKS (1 << BITS)
360 #define THRESHOLD 5000
363 auto void r3(struct elt *from, struct elt *to, uns n, uns lev);
364 void r3(struct elt *from, struct elt *to, uns n, uns lev)
366 uns sh = 32 - lev*BITS;
368 bzero(cnt, sizeof(cnt));
369 for (uns i=0; i<n; i++)
370 cnt[(from[i].key >> sh) & (BUCKS - 1)]++;
372 for (uns i=0; i<BUCKS; i++)
379 for (uns i=0; i<n; i++)
381 to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++] = from[i];
383 sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]);
386 for (uns i=0; i<BUCKS; i++)
389 if (lev >= LEVELS || l <= THRESHOLD)
392 if ((lev % 2) != ODDEVEN)
393 memcpy(from+pos, to+pos, l * sizeof(struct elt));
396 r3(to+pos, from+pos, l, lev+1);
412 static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z)
416 if (x->key <= y->key)
441 static void mergesort(void)
443 struct elt *from, *to;
447 struct elt *x = ary, *z = alt, *last = ary + (n & ~1U);
450 if (x[0].key < x[1].key)
451 *z++ = *x++, *z++ = *x++;
463 for (; (1U << lev) < n; lev++)
466 from = alt, to = ary;
468 from = ary, to = alt;
469 struct elt *x, *z, *last;
474 while (x + 2*step <= last)
476 z = mrg(x, x+step, x+step, x+2*step, z);
480 mrg(x, x+step, x+step, last, z);
482 memcpy(z, x, (byte*)last - (byte*)x);
488 static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
493 bzero(cnt, sizeof(cnt));
494 for (uns i=0; i<WAYS; i++)
495 k[i] = ar[random() % n];
497 for (uns i=0; i<n; i++)
500 #define FW(delta) if (ar[i].key > k[w+delta].key) w += delta
512 struct elt *y = al, *way[WAYS], *z;
513 for (uns i=0; i<WAYS; i++)
519 for (uns i=0; i<n; i++)
526 for (uns i=0; i<WAYS; i++)
529 sampsort(cnt[i], y, z, dest, wbuf);
534 memcpy(z, y, cnt[i]*sizeof(struct elt));
543 static void samplesort(void)
545 byte *aux = xmalloc(n);
546 sampsort(n, ary, alt, ary, aux);
550 static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
555 bzero(cnt, sizeof(cnt));
556 for (uns i=0; i<WAYS; i++)
557 k[i] = ar[random() % n];
559 struct elt *k1 = ar, *k2 = ar+1, *kend = ar+n;
564 #define FW1(delta) if (k1->key > k[w1+delta].key) w1 += delta
565 #define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta
584 FW1(128); FW1(64); FW1(32); FW1(16);
585 FW1(8); FW1(4); FW1(2); FW1(1);
589 struct elt *y = al, *way[WAYS], *z;
590 for (uns i=0; i<WAYS; i++)
596 for (uns i=0; i<n; i++)
603 for (uns i=0; i<WAYS; i++)
606 sampsort2(cnt[i], y, z, dest, wbuf);
611 memcpy(z, y, cnt[i]*sizeof(struct elt));
621 static void samplesort2(void)
623 byte *aux = xmalloc(n);
624 sampsort2(n, ary, alt, ary, aux);
628 static void mk_ary(void)
632 struct MD5Context ctx;
635 bzero(block, sizeof(block));
638 for (uns i=0; i<n; i++)
644 MD5Transform(ctx.buf, block);
646 ary[i].key = ctx.buf[i%4];
648 ary[i].key = i*(~0U/(n-1));
650 for (uns j=1; j<sizeof(struct elt)/4; j++)
651 ((u32*)&ary[i])[j] = ROL(ary[i].key, 3*j);
656 static void chk_ary(void)
659 for (uns i=1; i<n; i++)
660 if (ary[i].key < ary[i-1].key)
661 die("Missorted at %d", i);
668 static void mk_ind(void)
671 ind = xmalloc(sizeof(struct elt *) * n);
672 for (uns i=0; i<n; i++)
676 static void chk_ind(void)
679 for (uns i=1; i<n; i++)
680 if (ind[i]->key < ind[i-1]->key)
681 die("Missorted at %d", i);
689 int main(int argc, char **argv)
695 while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0)
699 op |= (1 << (opt - '0'));
705 array0 = alloc_elts(n);
706 array1 = alloc_elts(n);
707 for (uns i=0; i<n; i++)
708 array0[i] = array1[i] = (struct elt) { 0 };
712 for (uns i=0; i<5; i++)
715 memcpy(alt, ary, sizeof(struct elt) * n);
716 memcpy(ary, alt, sizeof(struct elt) * n);
718 for (uns j=0; j<n; j++)
720 for (uns j=0; j<n; j++)
724 log(L_DEBUG, "memcpy: %d", get_timer()/10);
726 #define BENCH(type, name, func) mk_##type(); init_timer(); func; log(L_DEBUG, name ": %d", get_timer()); chk_##type()
728 //BENCH(ary, "qsort", qsort(ary, n, sizeof(struct elt), comp));
729 //BENCH(ary, "arraysort", as_sort(n, ary));
730 //BENCH(ind, "indirect qsort", qsort(ind, n, sizeof(struct elt *), comp_ind));
731 //BENCH(ind, "indirect arraysort", asi_sort(n));
732 //BENCH(ary, "radix1", r1_sort());
733 //BENCH(ary, "radix1b", r1b_sort());
734 BENCH(ary, "radix1c", r1c_sort());
735 //BENCH(ary, "radix1c-sse", r1c_sse_sort());
736 //BENCH(ary, "radix1d", r1d_sort());
737 //BENCH(ary, "radix2", r2_sort());
738 BENCH(ary, "radix3", r3_sort());
739 //BENCH(ary, "mergesort", mergesort());
740 //BENCH(ary, "samplesort", samplesort());
741 //BENCH(ary, "samplesort2", samplesort2());
743 free_elts(array0, n);
744 free_elts(array1, n);