2 * Experiments with various sorting algorithms
4 * (c) 2007 Martin Mares <mj@ucw.cz>
7 #include "sherlock/sherlock.h"
8 #include "lib/getopt.h"
22 static struct elt *ary, *alt, **ind, *array0, *array1;
23 static uns n = 10000000;
26 static struct elt *alloc_elts(uns n)
28 return big_alloc(n * sizeof(struct elt));
31 static void free_elts(struct elt *a, uns n)
33 big_free(a, n * sizeof(struct elt));
36 static int comp(const void *x, const void *y)
38 const struct elt *xx = x, *yy = y;
39 return (xx->key < yy->key) ? -1 : (xx->key > yy->key) ? 1 : 0;
42 static int comp_ind(const void *x, const void *y)
44 const struct elt * const *xx = x, * const *yy = y;
45 return comp(*xx, *yy);
48 #define ASORT_PREFIX(x) as_##x
49 #define ASORT_KEY_TYPE u32
50 #define ASORT_ELT(i) a[i].key
51 #define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0)
52 #define ASORT_EXTRA_ARGS , struct elt *a
53 #include "lib/arraysort.h"
55 #define ASORT_PREFIX(x) asi_##x
56 #define ASORT_KEY_TYPE u32
57 #define ASORT_ELT(i) ind[i]->key
58 #define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0)
59 #include "lib/arraysort.h"
61 static void r1_sort(void)
63 struct elt *from = ary, *to = alt, *tmp;
66 for (uns sh=0; sh<32; sh+=BITS)
68 bzero(cnt, sizeof(cnt));
69 for (uns i=0; i<n; i++)
70 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
72 for (uns i=0; i<(1<<BITS); i++)
79 for (uns i=0; i<n; i++)
80 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
81 ASSERT(cnt[(1 << BITS)-1] == n);
82 tmp=from, from=to, to=tmp;
88 static void r1b_sort(void)
90 struct elt *from = ary, *to = alt, *tmp;
92 uns cnt[1 << BITS], cnt2[1 << BITS];
93 for (uns sh=0; sh<32; sh+=BITS)
96 memcpy(cnt, cnt2, sizeof(cnt));
99 bzero(cnt, sizeof(cnt));
100 for (uns i=0; i<n; i++)
101 cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
104 for (uns i=0; i<(1<<BITS); i++)
111 bzero(cnt2, sizeof(cnt2));
112 for (uns i=0; i<n; i++)
114 cnt2[(from[i].key >> (sh + BITS)) & ((1 << BITS) - 1)]++;
115 to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
117 ASSERT(cnt[(1 << BITS)-1] == n);
118 tmp=from, from=to, to=tmp;
124 static void r1c_sort(void)
127 struct elt *ptrs[256], *x, *lim;
129 x = ary; lim = ary + n;
130 bzero(cnt, sizeof(cnt));
132 cnt[x++->key & 255]++;
134 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
137 x = ary; lim = ary + n;
138 bzero(cnt, sizeof(cnt));
141 cnt[(x->key >> 8) & 255]++;
142 *ptrs[x->key & 255]++ = *x;
147 x = alt; lim = alt + n;
148 bzero(cnt, sizeof(cnt));
151 cnt[(x->key >> 16) & 255]++;
152 *ptrs[(x->key >> 8) & 255]++ = *x;
157 x = ary; lim = ary + n;
158 bzero(cnt, sizeof(cnt));
161 cnt[(x->key >> 24) & 255]++;
162 *ptrs[(x->key >> 16) & 255]++ = *x;
167 x = alt; lim = alt + n;
170 *ptrs[(x->key >> 24) & 255]++ = *x;
176 #include <emmintrin.h>
178 static inline void sse_copy_elt(struct elt *to, struct elt *from)
180 __m128i m = _mm_load_si128((__m128i *) from);
181 _mm_store_si128((__m128i *) to, m);
184 static void r1c_sse_sort(void)
187 struct elt *ptrs[256], *x, *lim;
189 ASSERT(sizeof(struct elt) == 16);
190 ASSERT(!((uintptr_t)alt & 15));
191 ASSERT(!((uintptr_t)ary & 15));
193 x = ary; lim = ary + n;
194 bzero(cnt, sizeof(cnt));
196 cnt[x++->key & 255]++;
198 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
201 x = ary; lim = ary + n;
202 bzero(cnt, sizeof(cnt));
205 cnt[(x->key >> 8) & 255]++;
206 sse_copy_elt(ptrs[x->key & 255]++, x);
211 x = alt; lim = alt + n;
212 bzero(cnt, sizeof(cnt));
215 cnt[(x->key >> 16) & 255]++;
216 sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x);
221 x = ary; lim = ary + n;
222 bzero(cnt, sizeof(cnt));
225 cnt[(x->key >> 24) & 255]++;
226 sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x);
231 x = alt; lim = alt + n;
234 sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x);
240 static void r1d_sort(void)
243 struct elt *ptrs[256], *x, *y, *lim;
247 x = ary; lim = ary + n;
248 bzero(cnt, sizeof(cnt));
251 cnt[x++->key & 255]++;
252 cnt[x++->key & 255]++;
253 cnt[x++->key & 255]++;
254 cnt[x++->key & 255]++;
257 #define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
260 x = ary; y = ary+n/2; lim = ary + n/2;
261 bzero(cnt, sizeof(cnt));
264 cnt[(x->key >> 8) & 255]++;
265 cnt[(y->key >> 8) & 255]++;
266 *ptrs[x->key & 255]++ = *x;
267 *ptrs[y->key & 255]++ = *y;
269 cnt[(x->key >> 8) & 255]++;
270 cnt[(y->key >> 8) & 255]++;
271 *ptrs[x->key & 255]++ = *x;
272 *ptrs[y->key & 255]++ = *y;
277 x = alt; lim = alt + n;
278 bzero(cnt, sizeof(cnt));
281 cnt[(x->key >> 16) & 255]++;
282 *ptrs[(x->key >> 8) & 255]++ = *x;
284 cnt[(x->key >> 16) & 255]++;
285 *ptrs[(x->key >> 8) & 255]++ = *x;
290 x = ary; lim = ary + n;
291 bzero(cnt, sizeof(cnt));
294 cnt[(x->key >> 24) & 255]++;
295 *ptrs[(x->key >> 16) & 255]++ = *x;
297 cnt[(x->key >> 24) & 255]++;
298 *ptrs[(x->key >> 16) & 255]++ = *x;
303 x = alt; lim = alt + n;
306 *ptrs[(x->key >> 24) & 255]++ = *x;
308 *ptrs[(x->key >> 24) & 255]++ = *x;
314 static void r2_sort(void)
316 struct elt *from = ary, *to = alt;
319 bzero(cnt, sizeof(cnt));
320 for (uns i=0; i<n; i++)
321 cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++;
323 for (uns i=0; i<(1<<BITS); i++)
330 for (uns i=0; i<n; i++)
331 to[cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i];
332 ASSERT(cnt[(1 << BITS)-1] == n);
335 for (uns i=0; i<(1 << BITS); i++)
337 as_sort(cnt[i] - pos, alt+pos);
344 static void r3_sort(void)
348 #define BUCKS (1 << BITS)
349 #define THRESHOLD 5000
352 auto void r3(struct elt *from, struct elt *to, uns n, uns lev);
353 void r3(struct elt *from, struct elt *to, uns n, uns lev)
355 uns sh = 32 - lev*BITS;
357 bzero(cnt, sizeof(cnt));
358 for (uns i=0; i<n; i++)
359 cnt[(from[i].key >> sh) & (BUCKS - 1)]++;
361 for (uns i=0; i<BUCKS; i++)
368 for (uns i=0; i<n; i++)
370 to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++] = from[i];
372 sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]);
375 for (uns i=0; i<BUCKS; i++)
378 if (lev >= LEVELS || l <= THRESHOLD)
381 if ((lev % 2) != ODDEVEN)
382 memcpy(from+pos, to+pos, l * sizeof(struct elt));
385 r3(to+pos, from+pos, l, lev+1);
401 static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z)
405 if (x->key <= y->key)
430 static void mergesort(void)
432 struct elt *from, *to;
436 struct elt *x = ary, *z = alt, *last = ary + (n & ~1U);
439 if (x[0].key < x[1].key)
440 *z++ = *x++, *z++ = *x++;
452 for (; (1U << lev) < n; lev++)
455 from = alt, to = ary;
457 from = ary, to = alt;
458 struct elt *x, *z, *last;
463 while (x + 2*step <= last)
465 z = mrg(x, x+step, x+step, x+2*step, z);
469 mrg(x, x+step, x+step, last, z);
471 memcpy(z, x, (byte*)last - (byte*)x);
477 static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
482 bzero(cnt, sizeof(cnt));
483 for (uns i=0; i<WAYS; i++)
484 k[i] = ar[random() % n];
486 for (uns i=0; i<n; i++)
489 #define FW(delta) if (ar[i].key > k[w+delta].key) w += delta
501 struct elt *y = al, *way[WAYS], *z;
502 for (uns i=0; i<WAYS; i++)
508 for (uns i=0; i<n; i++)
515 for (uns i=0; i<WAYS; i++)
518 sampsort(cnt[i], y, z, dest, wbuf);
523 memcpy(z, y, cnt[i]*sizeof(struct elt));
532 static void samplesort(void)
534 byte *aux = xmalloc(n);
535 sampsort(n, ary, alt, ary, aux);
539 static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
544 bzero(cnt, sizeof(cnt));
545 for (uns i=0; i<WAYS; i++)
546 k[i] = ar[random() % n];
548 struct elt *k1 = ar, *k2 = ar+1, *kend = ar+n;
553 #define FW1(delta) if (k1->key > k[w1+delta].key) w1 += delta
554 #define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta
573 FW1(128); FW1(64); FW1(32); FW1(16);
574 FW1(8); FW1(4); FW1(2); FW1(1);
578 struct elt *y = al, *way[WAYS], *z;
579 for (uns i=0; i<WAYS; i++)
585 for (uns i=0; i<n; i++)
592 for (uns i=0; i<WAYS; i++)
595 sampsort2(cnt[i], y, z, dest, wbuf);
600 memcpy(z, y, cnt[i]*sizeof(struct elt));
610 static void samplesort2(void)
612 byte *aux = xmalloc(n);
613 sampsort2(n, ary, alt, ary, aux);
617 static void mk_ary(void)
621 struct MD5Context ctx;
624 bzero(block, sizeof(block));
627 for (uns i=0; i<n; i++)
633 MD5Transform(ctx.buf, block);
635 ary[i].key = ctx.buf[i%4];
637 ary[i].key = i*(~0U/(n-1));
639 for (uns j=1; j<sizeof(struct elt)/4; j++)
640 ((u32*)&ary[i])[j] = ROL(ary[i].key, 3*j);
645 static void chk_ary(void)
648 for (uns i=1; i<n; i++)
649 if (ary[i].key < ary[i-1].key)
650 die("Missorted at %d", i);
657 static void mk_ind(void)
660 ind = xmalloc(sizeof(struct elt *) * n);
661 for (uns i=0; i<n; i++)
665 static void chk_ind(void)
668 for (uns i=1; i<n; i++)
669 if (ind[i]->key < ind[i-1]->key)
670 die("Missorted at %d", i);
678 int main(int argc, char **argv)
684 while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0)
688 op |= (1 << (opt - '0'));
694 array0 = alloc_elts(n);
695 array1 = alloc_elts(n);
696 for (uns i=0; i<n; i++)
697 array0[i] = array1[i] = (struct elt) { 0 };
702 for (uns i=0; i<5; i++)
705 memcpy(alt, ary, sizeof(struct elt) * n);
706 memcpy(ary, alt, sizeof(struct elt) * n);
708 for (uns j=0; j<n; j++)
710 for (uns j=0; j<n; j++)
714 log(L_DEBUG, "memcpy: %d", get_timer(&timer)/10);
716 #define BENCH(type, name, func) mk_##type(); init_timer(&timer); func; log(L_DEBUG, name ": %d", get_timer(&timer)); chk_##type()
718 //BENCH(ary, "qsort", qsort(ary, n, sizeof(struct elt), comp));
719 //BENCH(ary, "arraysort", as_sort(n, ary));
720 //BENCH(ind, "indirect qsort", qsort(ind, n, sizeof(struct elt *), comp_ind));
721 //BENCH(ind, "indirect arraysort", asi_sort(n));
722 //BENCH(ary, "radix1", r1_sort());
723 //BENCH(ary, "radix1b", r1b_sort());
724 BENCH(ary, "radix1c", r1c_sort());
725 //BENCH(ary, "radix1c-sse", r1c_sse_sort());
726 //BENCH(ary, "radix1d", r1d_sort());
727 //BENCH(ary, "radix2", r2_sort());
728 BENCH(ary, "radix3", r3_sort());
729 //BENCH(ary, "mergesort", mergesort());
730 //BENCH(ary, "samplesort", samplesort());
731 //BENCH(ary, "samplesort2", samplesort2());
733 free_elts(array0, n);
734 free_elts(array1, n);