2 * An experiment with sorting algorithms
5 #include "sherlock/sherlock.h"
6 #include "lib/getopt.h"
/* Element buffers and pointer index shared by the routines below: the sorts
 * read `ary`, `alt` and `ind`, while main() allocates `array0` and `array1`.
 * NOTE(review): where `ary`/`alt` get pointed at `array0`/`array1` should be confirmed. */
20 static struct elt *ary, *alt, **ind, *array0, *array1;
/* Element count used as the loop bound by the generators, checkers and the
 * sorts that do not take their own `n` parameter; defaults to ten million. */
21 static uns n = 10000000;
/* Allocate room for `n` elements with big_alloc(); the request is
 * n * sizeof(struct elt) bytes. Release the result with free_elts(). */
24 static struct elt *alloc_elts(uns n)
26 return big_alloc(n * sizeof(struct elt));
/* Release an element array through big_free(). `n` is used to recompute the
 * byte size (n * sizeof(struct elt)), the same formula alloc_elts() uses. */
29 static void free_elts(struct elt *a, uns n)
31 big_free(a, n * sizeof(struct elt));
/* Three-way comparison of two `struct elt` objects by their `key` field.
 * Takes untyped pointers (the shape qsort() expects) and returns -1, 1 or 0
 * when the first key is smaller, greater or equal respectively. */
34 static int comp(const void *x, const void *y)
36 const struct elt *xx = x, *yy = y;
/* Explicit comparisons rather than a subtraction, so the result is always exactly -1, 0 or 1. */
37 return (xx->key < yy->key) ? -1 : (xx->key > yy->key) ? 1 : 0;
/* Comparison callback for an array of element pointers: each argument points
 * at a `struct elt *`, which is dereferenced once and handed to comp(). */
40 static int comp_ind(const void *x, const void *y)
42 const struct elt * const *xx = x, * const *yy = y;
43 return comp(*xx, *yy);
/* Parameters for one inclusion of lib/arraysort.h operating directly on an
 * element array. Generated names get the `as_` prefix (r2_sort() calls
 * as_sort(count, array)). */
46 #define ASORT_PREFIX(x) as_##x
/* Keys are compared as u32 values. */
47 #define ASORT_KEY_TYPE u32
/* The key of the i-th item is the `key` field of a[i]. */
48 #define ASORT_ELT(i) a[i].key
/* Swapping exchanges whole elements by value. */
49 #define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0)
/* The array being sorted is passed in as an additional argument named `a`. */
50 #define ASORT_EXTRA_ARGS , struct elt *a
51 #include "lib/arraysort.h"
/* Parameters for a second inclusion of lib/arraysort.h, this time working
 * through the global pointer array `ind`. Generated names get the `asi_`
 * prefix. No ASORT_EXTRA_ARGS is defined for this instance. */
53 #define ASORT_PREFIX(x) asi_##x
54 #define ASORT_KEY_TYPE u32
/* Keys are reached through the pointer stored in ind[i]. */
55 #define ASORT_ELT(i) ind[i]->key
/* Only the pointers are exchanged; the elements themselves stay where they are. */
56 #define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0)
57 #include "lib/arraysort.h"
/* Radix sort that handles BITS bits of the key per pass, lowest bits first.
 * Elements are moved back and forth between `ary` and `alt`. */
59 static void r1_sort(void)
61 struct elt *from = ary, *to = alt, *tmp;
/* One pass per group of BITS key bits; `sh` is the shift selecting the group. */
64 for (uns sh=0; sh<32; sh+=BITS)
66 bzero(cnt, sizeof(cnt));
/* Histogram: count the elements belonging to each bucket for this digit. */
67 for (uns i=0; i<n; i++)
68 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
/* NOTE(review): presumably converts the counts into bucket start positions; confirm against the loop body. */
70 for (uns i=0; i<(1<<BITS); i++)
/* Scatter: every element goes to the next free slot of its bucket in `to`,
 * and the bucket's counter is advanced. */
77 for (uns i=0; i<n; i++)
78 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
/* Sanity check: the counter of the last bucket must have reached n. */
79 ASSERT(cnt[(1 << BITS)-1] == n);
/* Exchange source and destination for the next pass. */
80 tmp=from, from=to, to=tmp;
/* Variant of r1_sort() that gathers the histogram for the next digit (into
 * `cnt2`) while it scatters the elements for the current one. */
86 static void r1b_sort(void)
88 struct elt *from = ary, *to = alt, *tmp;
/* `cnt` holds the counters of the current digit, `cnt2` those of the following digit. */
90 uns cnt[1 << BITS], cnt2[1 << BITS];
91 for (uns sh=0; sh<32; sh+=BITS)
/* Take over the histogram that was collected during the previous scatter.
 * NOTE(review): the condition choosing between this copy and the explicit
 * counting below should be confirmed. */
94 memcpy(cnt, cnt2, sizeof(cnt));
/* Count the current digit from scratch. */
97 bzero(cnt, sizeof(cnt));
98 for (uns i=0; i<n; i++)
99 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
/* NOTE(review): presumably converts counts into bucket start positions; confirm against the loop body. */
102 for (uns i=0; i<(1<<BITS); i++)
109 bzero(cnt2, sizeof(cnt2));
110 for (uns i=0; i<n; i++)
/* Count the digit of the following pass...
 * NOTE(review): on the last pass sh + BITS may reach 32, which would be a
 * shift by the full width of a 32-bit key; check whether that is guarded. */
112 cnt2[(from[i].key >> (sh + BITS)) & ((1 << BITS) - 1)]++;
/* ...and move the element into its bucket for the current pass. */
113 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
/* Sanity check: the counter of the last bucket must have reached n. */
115 ASSERT(cnt[(1 << BITS)-1] == n);
/* Exchange source and destination for the next pass. */
116 tmp=from, from=to, to=tmp;
/* Radix sort with fixed 8-bit digits (256 buckets) written with pointers
 * instead of indices. While the elements are distributed by one byte of the
 * key, the histogram of the next byte is collected in the same sweep. The
 * source alternates between `ary` and `alt`.
 * NOTE(review): the destination of each pass is selected by PTRS(...) calls
 * that should be confirmed. */
122 static void r1c_sort(void)
/* ptrs[b] is the write cursor of bucket b; x walks the source up to lim. */
125 struct elt *ptrs[256], *x, *lim;
/* Initial histogram of the lowest key byte over `ary`. */
127 x = ary; lim = ary + n;
128 bzero(cnt, sizeof(cnt));
130 cnt[x++->key & 255]++;
/* PTRS(start): lay the 256 buckets out consecutively from `start`, giving
 * bucket i a cursor at its beginning and reserving cnt[i] slots for it.
 * Note that the macro overwrites x. */
132 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
/* Pass over `ary`: distribute by byte 0 while counting byte 1. */
135 x = ary; lim = ary + n;
136 bzero(cnt, sizeof(cnt));
139 cnt[(x->key >> 8) & 255]++;
140 *ptrs[x->key & 255]++ = *x;
/* Pass over `alt`: distribute by byte 1 while counting byte 2. */
145 x = alt; lim = alt + n;
146 bzero(cnt, sizeof(cnt));
149 cnt[(x->key >> 16) & 255]++;
150 *ptrs[(x->key >> 8) & 255]++ = *x;
/* Pass over `ary`: distribute by byte 2 while counting byte 3. */
155 x = ary; lim = ary + n;
156 bzero(cnt, sizeof(cnt));
159 cnt[(x->key >> 24) & 255]++;
160 *ptrs[(x->key >> 16) & 255]++ = *x;
/* Final pass over `alt`: distribute by the highest byte; nothing left to count. */
165 x = alt; lim = alt + n;
168 *ptrs[(x->key >> 24) & 255]++ = *x;
174 #include <emmintrin.h>
/* Copy one element from `from` to `to` as a single 128-bit value, using the
 * SSE2 aligned load and store intrinsics. Both pointers therefore have to be
 * 16-byte aligned and the element must be 16 bytes long; r1c_sse_sort()
 * asserts these conditions for `ary` and `alt` before using this helper. */
176 static inline void sse_copy_elt(struct elt *to, struct elt *from)
178 __m128i m = _mm_load_si128((__m128i *) from);
179 _mm_store_si128((__m128i *) to, m);
/* Same pass structure as r1c_sort() (8-bit digits, histogram of the next
 * byte gathered during the current distribution), but elements are moved
 * with sse_copy_elt() instead of a plain struct assignment. */
182 static void r1c_sse_sort(void)
185 struct elt *ptrs[256], *x, *lim;
/* Preconditions of the 128-bit aligned copies: 16-byte elements and
 * 16-byte aligned arrays. */
187 ASSERT(sizeof(struct elt) == 16);
188 ASSERT(!((addr_int_t)alt & 15));
189 ASSERT(!((addr_int_t)ary & 15));
/* Initial histogram of the lowest key byte over `ary`. */
191 x = ary; lim = ary + n;
192 bzero(cnt, sizeof(cnt));
194 cnt[x++->key & 255]++;
/* PTRS(start): lay the 256 buckets out consecutively from `start`, giving
 * bucket i a cursor at its beginning and reserving cnt[i] slots for it.
 * Note that the macro overwrites x. */
196 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
/* Pass over `ary`: distribute by byte 0 while counting byte 1. */
199 x = ary; lim = ary + n;
200 bzero(cnt, sizeof(cnt));
203 cnt[(x->key >> 8) & 255]++;
204 sse_copy_elt(ptrs[x->key & 255]++, x);
/* Pass over `alt`: distribute by byte 1 while counting byte 2. */
209 x = alt; lim = alt + n;
210 bzero(cnt, sizeof(cnt));
213 cnt[(x->key >> 16) & 255]++;
214 sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x);
/* Pass over `ary`: distribute by byte 2 while counting byte 3. */
219 x = ary; lim = ary + n;
220 bzero(cnt, sizeof(cnt));
223 cnt[(x->key >> 24) & 255]++;
224 sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x);
/* Final pass over `alt`: distribute by the highest byte. */
229 x = alt; lim = alt + n;
232 sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x);
/* Another 8-bit radix sort following the scheme of r1c_sort(), with the
 * per-element statements written out several times in a row.
 * NOTE(review): the repetition looks like manual loop unrolling; the
 * enclosing loops and the handling of counts that are not a multiple of the
 * repetition factor should be confirmed. */
238 static void r1d_sort(void)
/* Besides x, a second read cursor y is used by the first distribution pass. */
241 struct elt *ptrs[256], *x, *y, *lim;
/* Initial histogram of the lowest key byte; four elements are counted per repetition. */
245 x = ary; lim = ary + n;
246 bzero(cnt, sizeof(cnt));
249 cnt[x++->key & 255]++;
250 cnt[x++->key & 255]++;
251 cnt[x++->key & 255]++;
252 cnt[x++->key & 255]++;
/* PTRS(start): lay the 256 buckets out consecutively from `start`, giving
 * bucket i a cursor at its beginning and reserving cnt[i] slots for it.
 * Note that the macro overwrites x. */
255 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
/* First distribution pass reads two streams at once: x starts at the
 * beginning of `ary`, y at its middle, and lim marks the middle. */
258 x = ary; y = ary+n/2; lim = ary + n/2;
259 bzero(cnt, sizeof(cnt));
/* Count byte 1 and distribute by byte 0 for the elements at x and y. */
262 cnt[(x->key >> 8) & 255]++;
263 cnt[(y->key >> 8) & 255]++;
264 *ptrs[x->key & 255]++ = *x;
265 *ptrs[y->key & 255]++ = *y;
/* The same four statements once more. */
267 cnt[(x->key >> 8) & 255]++;
268 cnt[(y->key >> 8) & 255]++;
269 *ptrs[x->key & 255]++ = *x;
270 *ptrs[y->key & 255]++ = *y;
/* Pass over `alt`: distribute by byte 1 while counting byte 2, written out twice. */
275 x = alt; lim = alt + n;
276 bzero(cnt, sizeof(cnt));
279 cnt[(x->key >> 16) & 255]++;
280 *ptrs[(x->key >> 8) & 255]++ = *x;
282 cnt[(x->key >> 16) & 255]++;
283 *ptrs[(x->key >> 8) & 255]++ = *x;
/* Pass over `ary`: distribute by byte 2 while counting byte 3, written out twice. */
288 x = ary; lim = ary + n;
289 bzero(cnt, sizeof(cnt));
292 cnt[(x->key >> 24) & 255]++;
293 *ptrs[(x->key >> 16) & 255]++ = *x;
295 cnt[(x->key >> 24) & 255]++;
296 *ptrs[(x->key >> 16) & 255]++ = *x;
/* Final pass over `alt`: distribute by the highest byte, written out twice. */
301 x = alt; lim = alt + n;
304 *ptrs[(x->key >> 24) & 255]++ = *x;
306 *ptrs[(x->key >> 24) & 255]++ = *x;
/* Hybrid sort: a single distribution pass on the top BITS bits of the key
 * moves the elements from `ary` into buckets in `alt`, then every bucket is
 * sorted separately with as_sort(). */
312 static void r2_sort(void)
314 struct elt *from = ary, *to = alt;
/* Histogram of the most significant BITS bits. */
317 bzero(cnt, sizeof(cnt));
318 for (uns i=0; i<n; i++)
319 cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++;
/* NOTE(review): presumably converts the counts into bucket start positions; confirm against the loop body. */
321 for (uns i=0; i<(1<<BITS); i++)
/* Scatter the elements into their buckets, advancing each bucket's counter. */
328 for (uns i=0; i<n; i++)
329 to[cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i];
/* Sanity check: the counter of the last bucket must have reached n. */
330 ASSERT(cnt[(1 << BITS)-1] == n);
/* After the scatter cnt[i] is the end of bucket i, so the bucket spans
 * [pos, cnt[i]) in `alt` and holds cnt[i] - pos elements.
 * NOTE(review): `pos` is presumably advanced to cnt[i] after each bucket, and
 * it should be confirmed how the result gets back to `ary`. */
333 for (uns i=0; i<(1 << BITS); i++)
335 as_sort(cnt[i] - pos, alt+pos);
/* Recursive radix sort that starts from the most significant bits of the key.
 * The work is done by the helper r3(), which splits a range into BUCKS
 * buckets and then processes each bucket on its own. */
342 static void r3_sort(void)
/* Number of buckets per level. */
346 #define BUCKS (1 << BITS)
/* Buckets with at most this many elements are not split any further. */
347 #define THRESHOLD 5000
/* `auto` on a function declaration is the GCC extension for declaring a
 * nested function before its definition, which r3() needs in order to call
 * itself.
 * NOTE(review): assumes r3() is defined inside r3_sort(); confirm. */
350 auto void r3(struct elt *from, struct elt *to, uns n, uns lev);
/* r3: distribute the `n` elements at `from` into buckets at `to`, using the
 * key bits selected for recursion level `lev`. This `n` shadows the file-level n. */
351 void r3(struct elt *from, struct elt *to, uns n, uns lev)
/* The shift decreases by BITS with every level, so deeper levels examine lower key bits. */
353 uns sh = 32 - lev*BITS;
/* Histogram of the current digit. */
355 bzero(cnt, sizeof(cnt));
356 for (uns i=0; i<n; i++)
357 cnt[(from[i].key >> sh) & (BUCKS - 1)]++;
/* NOTE(review): presumably converts the counts into bucket start positions; confirm against the loop body. */
359 for (uns i=0; i<BUCKS; i++)
/* Scatter into `to`. Two alternative copy statements follow: a plain
 * assignment and the SSE helper.
 * NOTE(review): they are presumably selected by a preprocessor conditional; confirm. */
366 for (uns i=0; i<n; i++)
368 to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++] = from[i];
370 sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]);
/* Process the buckets one by one. */
373 for (uns i=0; i<BUCKS; i++)
/* Stop splitting once the maximum depth is reached or the bucket is small
 * (`l` is presumably the bucket length; confirm). */
376 if (lev >= LEVELS || l <= THRESHOLD)
/* When the parity of the level differs from ODDEVEN, the bucket is copied
 * from `to` back into `from`.
 * NOTE(review): presumably so that finished data ends up in the intended array; confirm. */
379 if ((lev % 2) != ODDEVEN)
380 memcpy(from+pos, to+pos, l * sizeof(struct elt));
/* Otherwise split the bucket further, with the roles of the two arrays exchanged. */
383 r3(to+pos, from+pos, l, lev+1);
/* Merge step used by mergesort(). Callers pass two adjacent runs as
 * (x, xl) and (y, yl) plus an output position z, and store the returned
 * pointer back into z.
 * NOTE(review): presumably xl/yl are the exclusive ends of the runs and the
 * return value is the output position after the merged data; confirm. */
399 static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z)
/* `<=` sends the comparison the way of the element at x when the keys are equal. */
403 if (x->key <= y->key)
/* Bottom-up merge sort between `ary` and `alt`: an initial sweep handles
 * pairs of elements, then runs are merged with mrg() level by level. */
428 static void mergesort(void)
430 struct elt *from, *to;
/* Pair sweep from `ary` into `alt`; `last` is the end of the even-length
 * prefix (n rounded down to a multiple of two). */
434 struct elt *x = ary, *z = alt, *last = ary + (n & ~1U);
/* A pair that is already in strictly ascending order is copied unchanged. */
437 if (x[0].key < x[1].key)
438 *z++ = *x++, *z++ = *x++;
/* Continue while runs of length 1 << lev are still shorter than the whole array. */
450 for (; (1U << lev) < n; lev++)
/* Source and destination alternate between the two arrays.
 * NOTE(review): the condition selecting the direction should be confirmed. */
453 from = alt, to = ary;
455 from = ary, to = alt;
456 struct elt *x, *z, *last;
/* Merge neighbouring runs of `step` elements while two complete runs remain. */
461 while (x + 2*step <= last)
463 z = mrg(x, x+step, x+step, x+2*step, z);
/* Merge a complete run with the shorter run that ends at `last`. */
467 mrg(x, x+step, x+step, last, z);
/* Copy the remaining elements [x, last) to the output without merging. */
469 memcpy(z, x, (byte*)last - (byte*)x);
/* Recursive worker of samplesort(). Picks WAYS randomly chosen elements of
 * `ar` as splitters, assigns each element a way by stepping through the
 * splitter array, and recurses on the resulting parts.
 * Parameters: `n` elements in `ar`; `al`, `dest` and `wbuf` are additional
 * buffers (samplesort() passes `alt`, `ary` and an n-byte buffer).
 * NOTE(review): the exact roles of `al`, `dest` and `wbuf` should be confirmed. */
475 static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
480 bzero(cnt, sizeof(cnt));
/* Draw WAYS splitters from random positions of the input.
 * NOTE(review): random() % n requires n > 0; confirm the callers guarantee it. */
481 for (uns i=0; i<WAYS; i++)
482 k[i] = ar[random() % n];
484 for (uns i=0; i<n; i++)
/* FW(delta): one step of the splitter search; move `w` forward by `delta`
 * when the element's key is greater than the splitter at w+delta. */
487 #define FW(delta) if (ar[i].key > k[w+delta].key) w += delta
/* y starts at the beginning of `al`; way[] holds one pointer per way. */
499 struct elt *y = al, *way[WAYS], *z;
500 for (uns i=0; i<WAYS; i++)
506 for (uns i=0; i<n; i++)
/* Handle the ways one by one: either sort the part recursively... */
513 for (uns i=0; i<WAYS; i++)
516 sampsort(cnt[i], y, z, dest, wbuf);
/* ...or copy its cnt[i] elements from y to z as they are.
 * NOTE(review): the condition choosing between the two should be confirmed. */
521 memcpy(z, y, cnt[i]*sizeof(struct elt));
/* Entry point of the sample sort benchmark: allocates an auxiliary buffer of
 * n bytes and runs sampsort() on `ary`, with `alt` as the second array and
 * `ary` again as `dest`.
 * NOTE(review): check that `aux` is released afterwards. */
530 static void samplesort(void)
532 byte *aux = xmalloc(n);
533 sampsort(n, ary, alt, ary, aux);
/* Variant of sampsort() whose classification step works with two element
 * cursors (k1 and k2) instead of an index. Parameters are the same as for
 * sampsort(). */
537 static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
542 bzero(cnt, sizeof(cnt));
/* Draw WAYS splitters from random positions of the input.
 * NOTE(review): random() % n requires n > 0; confirm the callers guarantee it. */
543 for (uns i=0; i<WAYS; i++)
544 k[i] = ar[random() % n];
/* k1 and k2 start on the first two elements; kend is the end of the input. */
546 struct elt *k1 = ar, *k2 = ar+1, *kend = ar+n;
/* FW1/FW2: one step of the splitter search for the element at k1 or k2
 * respectively, advancing w1 or w2 by `delta` when the key is greater than
 * the splitter at that offset. */
551 #define FW1(delta) if (k1->key > k[w1+delta].key) w1 += delta
552 #define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta
/* Eight halving steps (128 down to 1) for k1, so w1 can grow by up to 255. */
571 FW1(128); FW1(64); FW1(32); FW1(16);
572 FW1(8); FW1(4); FW1(2); FW1(1);
/* y starts at the beginning of `al`; way[] holds one pointer per way. */
576 struct elt *y = al, *way[WAYS], *z;
577 for (uns i=0; i<WAYS; i++)
583 for (uns i=0; i<n; i++)
/* Handle the ways one by one: either sort the part recursively... */
590 for (uns i=0; i<WAYS; i++)
593 sampsort2(cnt[i], y, z, dest, wbuf);
/* ...or copy its cnt[i] elements from y to z as they are.
 * NOTE(review): the condition choosing between the two should be confirmed. */
598 memcpy(z, y, cnt[i]*sizeof(struct elt));
/* Entry point of the second sample sort benchmark: allocates an auxiliary
 * buffer of n bytes and runs sampsort2() on `ary`, with `alt` as the second
 * array and `ary` again as `dest`.
 * NOTE(review): check that `aux` is released afterwards. */
608 static void samplesort2(void)
610 byte *aux = xmalloc(n);
611 sampsort2(n, ary, alt, ary, aux);
/* Fill `ary` with n test elements. Two ways of producing the key appear
 * below (MD5-derived values and evenly spaced values).
 * NOTE(review): which one is active is presumably decided by a preprocessor
 * conditional; confirm. */
615 static void mk_ary(void)
619 struct MD5Context ctx;
/* The MD5 input block starts out zeroed. */
622 bzero(block, sizeof(block));
625 for (uns i=0; i<n; i++)
/* Run the MD5 transform over the block, updating ctx.buf. */
631 MD5Transform(ctx.buf, block);
/* Use the words of ctx.buf as keys, cycling through indices 0..3. */
633 ary[i].key = ctx.buf[i%4];
/* Alternative: keys growing with i in steps of ~0U/(n-1).
 * NOTE(review): divides by zero when n is 1. */
635 ary[i].key = i*(~0U/(n-1));
/* Fill the remaining 32-bit words of the element with the key rotated by 3*j. */
637 for (uns j=1; j<sizeof(struct elt)/4; j++)
638 ((u32*)&ary[i])[j] = ROL(ary[i].key, 3*j);
/* Verify that `ary` is sorted in non-decreasing key order; terminates through
 * die() with the offending index at the first element smaller than its
 * predecessor. */
643 static void chk_ary(void)
646 for (uns i=1; i<n; i++)
647 if (ary[i].key < ary[i-1].key)
/* NOTE(review): %d is used with an `uns` index; %u looks like the matching
 * conversion if `uns` is an unsigned type — confirm. */
648 die("Missorted at %d", i);
/* Prepare the pointer array `ind` for the indirect sorts: allocates room for
 * n element pointers and then loops over all n positions.
 * NOTE(review): the loop presumably stores the address of each element of
 * `ary`; confirm against the loop body. */
655 static void mk_ind(void)
658 ind = xmalloc(sizeof(struct elt *) * n);
659 for (uns i=0; i<n; i++)
/* Verify that the elements reached through `ind` are in non-decreasing key
 * order; terminates through die() with the offending index otherwise. */
663 static void chk_ind(void)
666 for (uns i=1; i<n; i++)
667 if (ind[i]->key < ind[i-1]->key)
/* NOTE(review): %d is used with an `uns` index; %u looks like the matching
 * conversion if `uns` is an unsigned type — confirm. */
668 die("Missorted at %d", i);
/* Benchmark driver: parses the command line, allocates the two element
 * arrays, measures plain memory copying as a baseline and then runs the
 * enabled sort benchmarks. */
676 int main(int argc, char **argv)
/* Besides the standard configuration options, the option string accepts "1". */
682 while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0)
/* A digit option sets the bit with that number in `op`. */
686 op |= (1 << (opt - '0'));
692 array0 = alloc_elts(n);
693 array1 = alloc_elts(n);
/* Write a zeroed element to every slot of both arrays. */
694 for (uns i=0; i<n; i++)
695 array0[i] = array1[i] = (struct elt) { 0 };
/* Baseline measurement: five rounds of copying the data between `ary` and `alt`. */
699 for (uns i=0; i<5; i++)
702 memcpy(alt, ary, sizeof(struct elt) * n);
703 memcpy(ary, alt, sizeof(struct elt) * n);
/* NOTE(review): these loops are presumably an element-by-element alternative
 * to the memcpy() calls; confirm against the loop bodies. */
705 for (uns j=0; j<n; j++)
707 for (uns j=0; j<n; j++)
/* Report the timer reading divided by 10. */
711 log(L_DEBUG, "memcpy: %d", get_timer()/10);
/* BENCH(type, name, func): build the input with mk_<type>(), start the
 * timer, run `func`, log the timer reading under `name` and verify the
 * result with chk_<type>(). */
713 #define BENCH(type, name, func) mk_##type(); init_timer(); func; log(L_DEBUG, name ": %d", get_timer()); chk_##type()
/* Only the "radix1c" and "radix3" benchmarks are enabled; the rest are commented out. */
715 //BENCH(ary, "qsort", qsort(ary, n, sizeof(struct elt), comp));
716 //BENCH(ary, "arraysort", as_sort(n, ary));
717 //BENCH(ind, "indirect qsort", qsort(ind, n, sizeof(struct elt *), comp_ind));
718 //BENCH(ind, "indirect arraysort", asi_sort(n));
719 //BENCH(ary, "radix1", r1_sort());
720 //BENCH(ary, "radix1b", r1b_sort());
721 BENCH(ary, "radix1c", r1c_sort());
722 //BENCH(ary, "radix1c-sse", r1c_sse_sort());
723 //BENCH(ary, "radix1d", r1d_sort());
724 //BENCH(ary, "radix2", r2_sort());
725 BENCH(ary, "radix3", r3_sort());
726 //BENCH(ary, "mergesort", mergesort());
727 //BENCH(ary, "samplesort", samplesort());
728 //BENCH(ary, "samplesort2", samplesort2());
/* Release both element arrays. */
730 free_elts(array0, n);
731 free_elts(array1, n);