From: Martin Mares Date: Tue, 13 Jul 2010 11:44:35 +0000 (+0200) Subject: Cleanup: Moved sorter debugging tools to ucw/sorter/debug/ X-Git-Tag: v5.0~168 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=44feaeb65636c36e71fa1fd79710aa746867c17e;p=libucw.git Cleanup: Moved sorter debugging tools to ucw/sorter/debug/ --- diff --git a/debug/sorter/BENCH b/debug/sorter/BENCH deleted file mode 100644 index ad3a5f4a..00000000 --- a/debug/sorter/BENCH +++ /dev/null @@ -1,73 +0,0 @@ -How retros runs on different hardware: - -# 32-bit Athlon 64, gcc-4.1 -mj@albireo:~/src/sh/dev-sorter/run$ bin/retros -D 2006-11-23 23:17:36 [retros] memcpy: 212 -D 2006-11-23 23:17:44 [retros] qsort: 6947 -D 2006-11-23 23:17:48 [retros] arraysort: 3183 -D 2006-11-23 23:18:02 [retros] indirect qsort: 13116 -D 2006-11-23 23:18:24 [retros] indirect arraysort: 19176 -D 2006-11-23 23:18:30 [retros] radix1: 3755 -D 2006-11-23 23:18:34 [retros] radix1b: 3100 -D 2006-11-23 23:18:39 [retros] radix1c: 2777 -D 2006-11-23 23:18:43 [retros] radix1c-sse: 2602 -D 2006-11-23 23:18:47 [retros] radix1d: 2728 -D 2006-11-23 23:18:53 [retros] radix2: 4249 -D 2006-11-23 23:18:57 [retros] radix3: 2577 -D 2006-11-23 23:19:09 [retros] mergesort: 10399 -D 2006-11-23 23:19:16 [retros] samplesort: 5698 -D 2006-11-23 23:19:23 [retros] samplesort2: 5016 - -# 32-bit P4 Xeon, gcc-3.4 -sherlock@sherlock3:~/sherlock-mj/run$ bin/retros -D 2006-11-23 23:23:52 [retros] memcpy: 198 -D 2006-11-23 23:24:23 [retros] qsort: 30114 -D 2006-11-23 23:24:27 [retros] arraysort: 2882 -D 2006-11-23 23:24:43 [retros] indirect qsort: 15019 -D 2006-11-23 23:24:59 [retros] indirect arraysort: 13267 -D 2006-11-23 23:25:03 [retros] radix1: 1881 -D 2006-11-23 23:25:06 [retros] radix1b: 1442 -D 2006-11-23 23:25:08 [retros] radix1c: 1313 -D 2006-11-23 23:25:10 [retros] radix1c-sse: 1229 -D 2006-11-23 23:25:13 [retros] radix1d: 1324 -D 2006-11-23 23:25:17 [retros] radix2: 2598 -D 2006-11-23 23:25:19 [retros] radix3: 1419 -D 2006-11-23 23:25:25 [retros] mergesort: 4929 -D 2006-11-23 23:25:29 [retros] samplesort: 2742 -D 2006-11-23 23:25:33 [retros] samplesort2: 2350 - -# 64-bit P4 Xeon, gcc-3.4 -sherlock@sherlock4:~/sherlock-3.10/run$ bin/retros -D 2006-11-23 23:44:31 [retros] memcpy: 132 -D 2006-11-23 23:44:58 [retros] qsort: 26469 -D 2006-11-23 23:45:01 [retros] arraysort: 2307 -D 2006-11-23 23:45:12 [retros] indirect qsort: 10971 -D 2006-11-23 23:45:24 [retros] indirect arraysort: 10350 -D 2006-11-23 23:45:26 [retros] radix1: 1099 -D 2006-11-23 23:45:27 [retros] radix1b: 1052 -D 2006-11-23 23:45:29 [retros] radix1c: 1017 -D 2006-11-23 23:45:30 [retros] radix1c-sse: 1017 -D 2006-11-23 23:45:32 [retros] radix1d: 1016 -D 2006-11-23 23:45:34 [retros] radix2: 1661 -D 2006-11-23 23:45:36 [retros] radix3: 955 -D 2006-11-23 23:45:39 [retros] mergesort: 3302 -D 2006-11-23 23:45:42 [retros] samplesort: 2376 -D 2006-11-23 23:45:45 [retros] samplesort2: 1870 - -# 64-bit Turion X2 TL52, gcc-4.1.1 -pchar@paja ~/prog/sherlock-dev-sorter/run $ bin/retros -D 2006-11-24 00:32:38 [retros] memcpy: 93 -D 2006-11-24 00:32:46 [retros] qsort: 7530 -D 2006-11-24 00:32:50 [retros] arraysort: 2766 -D 2006-11-24 00:33:01 [retros] indirect qsort: 10543 -D 2006-11-24 00:33:13 [retros] indirect arraysort: 10169 -D 2006-11-24 00:33:16 [retros] radix1: 1319 -D 2006-11-24 00:33:18 [retros] radix1b: 1126 -D 2006-11-24 00:33:20 [retros] radix1c: 1084 -D 2006-11-24 00:33:22 [retros] radix1c-sse: 1126 -D 2006-11-24 00:33:24 [retros] radix1d: 1091 -D 2006-11-24 00:33:27 [retros] radix2: 2238 -D 2006-11-24 00:33:29 [retros] radix3: 1183 -D 2006-11-24 00:33:34 [retros] mergesort: 4036 -D 2006-11-24 00:33:37 [retros] samplesort: 2594 -D 2006-11-24 00:33:40 [retros] samplesort2: 2214 diff --git a/debug/sorter/Makefile b/debug/sorter/Makefile deleted file mode 100644 index 1ef8c620..00000000 --- a/debug/sorter/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# Tests related to the new sorter - -DIRS+=debug/sorter -PROGS+=$(addprefix $(o)/debug/sorter/,radix-tune-bits radix-tune-thresh) - -$(o)/debug/sorter/retros: $(o)/debug/sorter/retros.o $(LIBSH) -$(o)/debug/sorter/radix-file-test: $(o)/debug/sorter/radix-file-test.o $(LIBSH) -$(o)/debug/sorter/radix-asio-test: $(o)/debug/sorter/radix-asio-test.o $(LIBSH) -$(o)/debug/sorter/radix-tune-bits: $(s)/debug/sorter/radix-tune-bits.sh -$(o)/debug/sorter/radix-tune-thresh: $(s)/debug/sorter/radix-tune-thresh.sh diff --git a/debug/sorter/radix-asio-test.c b/debug/sorter/radix-asio-test.c deleted file mode 100644 index 168f6ff6..00000000 --- a/debug/sorter/radix-asio-test.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * An experiment with parallel reading and writing of files using ASIO. - * - * (c) 2007 Martin Mares - */ - -#include "ucw/lib.h" -#include "ucw/conf.h" -#include "ucw/lfs.h" -#include "ucw/asio.h" - -#include -#include -#include -#include -#include - -#define COPY -#define DIRECT O_DIRECT - -static timestamp_t timer; - -#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0) -#define P_UPDATE(cc) do { \ - cnt += cc; \ - if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \ - printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \ - fflush(stdout); cnt_rep += 1<<26; } } while(0) -#define P_FINAL do { \ - cnt_ms += get_timer(&timer); \ - msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \ -} while(0) - -static struct asio_queue io_queue; - -int main(int argc, char **argv) -{ - uns files, bufsize; - u64 total_size; - if (argc != 4 || - cf_parse_int(argv[1], (int*) &files) || - cf_parse_int(argv[2], (int*) &bufsize) || - cf_parse_u64(argv[3], &total_size)) - { - fprintf(stderr, "Usage: asio-test \n"); - return 1; - } - u64 cnt, cnt_rep; - uns cnt_ms; - int fd[files]; - byte name[files][16]; - struct asio_request *req[files]; - - init_timer(&timer); - - io_queue.buffer_size = bufsize; - io_queue.max_writebacks = 2; - asio_init_queue(&io_queue); - -#ifdef COPY - msg(L_INFO, "Creating input file"); - int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666); - ASSERT(in_fd >= 0); - ASSERT(!(total_size % bufsize)); - P_INIT; - for (uns i=0; iop = ASIO_WRITE_BACK; - r->fd = in_fd; - r->len = bufsize; - byte *xbuf = r->buffer; - for (uns j=0; j> 20), files, bufsize); - P_INIT; - for (uns i=0; iop = ASIO_READ; - rd->fd = in_fd; - rd->len = bufsize; - asio_submit(rd); - rr = asio_wait(&io_queue); - ASSERT(rr == rd && rd->status == (int)rd->len); - memcpy(r->buffer, rd->buffer, bufsize); - asio_put(rr); -#else - for (uns j=0; jbuffer[j] = round+i+j; -#endif - r->op = ASIO_WRITE_BACK; - r->fd = fd[i]; - r->len = bufsize; - asio_submit(r); - P_UPDATE(bufsize); - req[i] = asio_get(&io_queue); - } - } - for (uns i=0; iop = ASIO_READ; - r->fd = fd[i]; - r->len = bufsize; - asio_submit(r); - rr = asio_wait(&io_queue); - ASSERT(rr == r && r->status == (int)bufsize); - asio_put(r); - P_UPDATE(bufsize); - } - close(fd[i]); - } - P_FINAL; - - for (uns i=0; i - */ - -#include "ucw/lib.h" -#include "ucw/conf.h" -#include "ucw/lfs.h" - -#include -#include -#include -#include - -#define COPY -#define DIRECT 0 // or O_DIRECT - -static timestamp_t timer; - -#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0) -#define P_UPDATE(cc) do { \ - cnt += cc; \ - if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \ - printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \ - fflush(stdout); cnt_rep += 1<<26; } } while(0) -#define P_FINAL do { \ - cnt_ms += get_timer(&timer); \ - msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \ -} while(0) - -int main(int argc, char **argv) -{ - uns files, bufsize; - u64 total_size; - if (argc != 4 || - cf_parse_int(argv[1], (int*) &files) || - cf_parse_int(argv[2], (int*) &bufsize) || - cf_parse_u64(argv[3], &total_size)) - { - fprintf(stderr, "Usage: file-test \n"); - return 1; - } - u64 cnt, cnt_rep; - uns cnt_ms; - int fd[files]; - byte *buf[files], name[files][16]; - uns xbufsize = bufsize; // Used for single-file I/O - byte *xbuf = big_alloc(xbufsize); - - init_timer(&timer); - -#ifdef COPY - msg(L_INFO, "Creating input file"); - int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666); - ASSERT(in_fd >= 0); - ASSERT(!(total_size % xbufsize)); - P_INIT; - for (uns i=0; i> 20), files, bufsize); - P_INIT; - for (uns r=0; r -set -e -UCW_PROGNAME="$0" -. lib/libucw.sh - -# Path to Sherlock build directory -[ -n "$BUILD" ] || BUILD=.. -[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory" - -# Find out sort buffer size -parse-config 'Sorter{##SortBuffer}' -SORTBUF=$CF_Sorter_SortBuffer -[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer" -log "Detected sort buffer size $SORTBUF" - -# Size of the test -- should be slightly less than a half of SortBuffer -SIZE=$(($SORTBUF/2 - 8192)) -log "Decided to benchmark sorting of $SIZE byte data" - -# Which bit widths we try -WIDTHS="0 6 7 8 9 10 11 12 13 14" - -# Which RadixThresholds we try -THRS="2000 4000 10000 20000 50000" - -# Which sort-test tests we try -TESTS="2,5,8,15" - -# Check various bit widths of the radix sorter -rm -f tmp/radix-* -for W in $WIDTHS ; do - rm -f $BUILD/obj/ucw/sorter/sort-test{,.o} - if [ $W = 0 ] ; then - log "Compiling with no radix splits" - ( cd $BUILD && make obj/ucw/sorter/sort-test ) - OPT="-d32" - else - log "Compiling with $W-bit radix splits" - ( cd $BUILD && make CEXTRA="-DFORCE_RADIX_BITS=$W" obj/ucw/sorter/sort-test ) - OPT= - fi - for THR in $THRS ; do - log "Testing with RadixThreshold=$THR" - $BUILD/obj/ucw/sorter/sort-test -SThreads.DefaultStackSize=2M -SSorter.RadixThreshold=$THR -s$SIZE -t$TESTS $OPT -v 2>&1 | tee -a tmp/radix-$W - done -done - -echo "thresh" >tmp/radix-thrs -echo "test#" >tmp/radix-tests -for THR in $THRS ; do - for TEST in `echo $TESTS | tr ',' ' '` ; do - echo $THR >>tmp/radix-thrs - echo $TEST >>tmp/radix-tests - done -done - -FILES="tmp/radix-thrs tmp/radix-tests" -for W in $WIDTHS ; do - a=tmp/radix-$W - echo >$a.out "$W bits" - sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <$a >>$a.out - FILES="$FILES $a.out" -done - -log "These are the results:" -paste $FILES diff --git a/debug/sorter/radix-tune-thresh.sh b/debug/sorter/radix-tune-thresh.sh deleted file mode 100644 index 71d929b3..00000000 --- a/debug/sorter/radix-tune-thresh.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -# An utility for tuning the Sherlock's radix sorter threshold -# (c) 2007 Martin Mares -set -e -UCW_PROGNAME="$0" -. lib/libucw.sh - -# Path to Sherlock build directory -[ -n "$BUILD" ] || BUILD=.. -[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory" - -# Find out sort buffer size -parse-config 'Sorter{##SortBuffer}' -SORTBUF=$CF_Sorter_SortBuffer -[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer" -log "Detected sort buffer size $SORTBUF" - -# Find out radix-sorter width -[ -f "$BUILD/obj/config.mk" ] || die "Sherlock source not configured" -WIDTH=`sed <$BUILD/obj/config.mk 's/^CONFIG_UCW_RADIX_SORTER_BITS=\(.*\)/\1/;t;d'` -[ -n "$WIDTH" ] || die "CONFIG_UCW_RADIX_SORTER_BITS not set (!?)" -log "Detected radix-sorter width $WIDTH" - -# Maximum size of the test -- should be slightly less than a half of SortBuffer -SIZE=$(($SORTBUF/2 - 8192)) - -# Which sort-test test we try -TEST="2" - -# Which thresholds we try -THRS="16" -T=$SIZE -while [ $T -gt 100 ] ; do - THRS="$THRS $T" - T=$(($T/2)) -done - -if true ; then - -rm -f tmp/radix-* -echo "sizes" >tmp/radix-sizes -while [ $SIZE -gt 262144 ] ; do - echo $SIZE >>tmp/radix-sizes - for T in $THRS ; do - log "Trying size $SIZE with threshold $T" - $BUILD/obj/ucw/sorter/sort-test -SSorter.RadixThreshold=$T -s$SIZE -t$TEST -v 2>&1 | tee -a tmp/radix-$T - done - SIZE=$(($SIZE/2)) -done - -fi - -FILES=tmp/radix-sizes -for T in $THRS ; do - a=tmp/radix-$T - echo >$a.out $T - sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <$a >>$a.out - FILES="$FILES $a.out" -done - -log "These are the results:" -paste $FILES diff --git a/debug/sorter/retros.c b/debug/sorter/retros.c deleted file mode 100644 index 7bb692f6..00000000 --- a/debug/sorter/retros.c +++ /dev/null @@ -1,763 +0,0 @@ -/* - * Experiments with various sorting algorithms - * - * (c) 2007--2008 Martin Mares - */ - -#include "sherlock/sherlock.h" -#include "ucw/getopt.h" -#include "ucw/md5.h" -#include "ucw/heap.h" - -#include -#include -#include -#include -#include - -struct elt { - u32 key; - u32 ballast[3]; -}; - -static struct elt *ary, *alt, **ind, *array0, *array1; -static uns n = 10000000; -static u32 sum; - -static struct elt *alloc_elts(uns n) -{ - return big_alloc(n * sizeof(struct elt)); -} - -static void free_elts(struct elt *a, uns n) -{ - big_free(a, n * sizeof(struct elt)); -} - -static int comp(const void *x, const void *y) -{ - const struct elt *xx = x, *yy = y; - return (xx->key < yy->key) ? -1 : (xx->key > yy->key) ? 1 : 0; -} - -static int comp_ind(const void *x, const void *y) -{ - const struct elt * const *xx = x, * const *yy = y; - return comp(*xx, *yy); -} - -#define ASORT_PREFIX(x) as_##x -#define ASORT_KEY_TYPE u32 -#define ASORT_ELT(i) a[i].key -#define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0) -#define ASORT_EXTRA_ARGS , struct elt *a -#include "ucw/sorter/array-simple.h" - -#define ASORT_PREFIX(x) asi_##x -#define ASORT_KEY_TYPE u32 -#define ASORT_ELT(i) ind[i]->key -#define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0) -#include "ucw/sorter/array-simple.h" - -static void r1_sort(void) -{ - struct elt *from = ary, *to = alt, *tmp; -#define BITS 8 - uns cnt[1 << BITS]; - for (uns sh=0; sh<32; sh+=BITS) - { - bzero(cnt, sizeof(cnt)); - for (uns i=0; i> sh) & ((1 << BITS) - 1)]++; - uns pos = 0; - for (uns i=0; i<(1<> sh) & ((1 << BITS) - 1)]++] = from[i]; - ASSERT(cnt[(1 << BITS)-1] == n); - tmp=from, from=to, to=tmp; - } - ary = from; -#undef BITS -} - -static void r1b_sort(void) -{ - struct elt *from = ary, *to = alt, *tmp; -#define BITS 8 - uns cnt[1 << BITS], cnt2[1 << BITS]; - for (uns sh=0; sh<32; sh+=BITS) - { - if (sh) - memcpy(cnt, cnt2, sizeof(cnt)); - else - { - bzero(cnt, sizeof(cnt)); - for (uns i=0; i> sh) & ((1 << BITS) - 1)]++; - } - uns pos = 0; - for (uns i=0; i<(1<> (sh + BITS)) & ((1 << BITS) - 1)]++; - to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i]; - } - ASSERT(cnt[(1 << BITS)-1] == n); - tmp=from, from=to, to=tmp; - } - ary = from; -#undef BITS -} - -static void r1c_sort(void) -{ - uns cnt[256]; - struct elt *ptrs[256], *x, *lim; - - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - cnt[x++->key & 255]++; - -#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; } - - PTRS(alt); - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 8) & 255]++; - *ptrs[x->key & 255]++ = *x; - x++; - } - - PTRS(ary); - x = alt; lim = alt + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 16) & 255]++; - *ptrs[(x->key >> 8) & 255]++ = *x; - x++; - } - - PTRS(alt); - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 24) & 255]++; - *ptrs[(x->key >> 16) & 255]++ = *x; - x++; - } - - PTRS(ary); - x = alt; lim = alt + n; - while (x < lim) - { - *ptrs[(x->key >> 24) & 255]++ = *x; - x++; - } -#undef PTRS -} - -#include - -static inline void sse_copy_elt(struct elt *to, struct elt *from) -{ - __m128i m = _mm_load_si128((__m128i *) from); - _mm_store_si128((__m128i *) to, m); -} - -static void r1c_sse_sort(void) -{ - uns cnt[256]; - struct elt *ptrs[256], *x, *lim; - - ASSERT(sizeof(struct elt) == 16); - ASSERT(!((uintptr_t)alt & 15)); - ASSERT(!((uintptr_t)ary & 15)); - - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - cnt[x++->key & 255]++; - -#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; } - - PTRS(alt); - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 8) & 255]++; - sse_copy_elt(ptrs[x->key & 255]++, x); - x++; - } - - PTRS(ary); - x = alt; lim = alt + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 16) & 255]++; - sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x); - x++; - } - - PTRS(alt); - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 24) & 255]++; - sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x); - x++; - } - - PTRS(ary); - x = alt; lim = alt + n; - while (x < lim) - { - sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x); - x++; - } -#undef PTRS -} - -static void r1d_sort(void) -{ - uns cnt[256]; - struct elt *ptrs[256], *x, *y, *lim; - - ASSERT(!(n % 4)); - - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[x++->key & 255]++; - cnt[x++->key & 255]++; - cnt[x++->key & 255]++; - cnt[x++->key & 255]++; - } - -#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; } - - PTRS(alt); - x = ary; y = ary+n/2; lim = ary + n/2; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 8) & 255]++; - cnt[(y->key >> 8) & 255]++; - *ptrs[x->key & 255]++ = *x; - *ptrs[y->key & 255]++ = *y; - x++, y++; - cnt[(x->key >> 8) & 255]++; - cnt[(y->key >> 8) & 255]++; - *ptrs[x->key & 255]++ = *x; - *ptrs[y->key & 255]++ = *y; - x++, y++; - } - - PTRS(ary); - x = alt; lim = alt + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 16) & 255]++; - *ptrs[(x->key >> 8) & 255]++ = *x; - x++; - cnt[(x->key >> 16) & 255]++; - *ptrs[(x->key >> 8) & 255]++ = *x; - x++; - } - - PTRS(alt); - x = ary; lim = ary + n; - bzero(cnt, sizeof(cnt)); - while (x < lim) - { - cnt[(x->key >> 24) & 255]++; - *ptrs[(x->key >> 16) & 255]++ = *x; - x++; - cnt[(x->key >> 24) & 255]++; - *ptrs[(x->key >> 16) & 255]++ = *x; - x++; - } - - PTRS(ary); - x = alt; lim = alt + n; - while (x < lim) - { - *ptrs[(x->key >> 24) & 255]++ = *x; - x++; - *ptrs[(x->key >> 24) & 255]++ = *x; - x++; - } -#undef PTRS -} - -static void r2_sort(void) -{ - struct elt *from = ary, *to = alt; -#define BITS 14 - uns cnt[1 << BITS]; - bzero(cnt, sizeof(cnt)); - for (uns i=0; i> (32 - BITS)) & ((1 << BITS) - 1)]++; - uns pos = 0; - for (uns i=0; i<(1<> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i]; - ASSERT(cnt[(1 << BITS)-1] == n); - - pos = 0; - for (uns i=0; i<(1 << BITS); i++) - { - as_sort(cnt[i] - pos, alt+pos); - pos = cnt[i]; - } - ary = alt; -#undef BITS -} - -static void r3_sort(void) -{ -#define BITS 10 -#define LEVELS 2 -#define BUCKS (1 << BITS) -#define THRESHOLD 5000 -#define ODDEVEN 0 - - auto void r3(struct elt *from, struct elt *to, uns n, uns lev); - void r3(struct elt *from, struct elt *to, uns n, uns lev) - { - uns sh = 32 - lev*BITS; - uns cnt[BUCKS]; - bzero(cnt, sizeof(cnt)); - for (uns i=0; i> sh) & (BUCKS - 1)]++; - uns pos = 0; - for (uns i=0; i> sh) & (BUCKS - 1)]++] = from[i]; -#else - sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]); -#endif - pos = 0; - for (uns i=0; i= LEVELS || l <= THRESHOLD) - { - as_sort(l, to+pos); - if ((lev % 2) != ODDEVEN) - memcpy(from+pos, to+pos, l * sizeof(struct elt)); - } - else - r3(to+pos, from+pos, l, lev+1); - pos = cnt[i]; - } - } - - r3(ary, alt, n, 1); - if (ODDEVEN) - ary = alt; - -#undef ODDEVEN -#undef THRESHOLD -#undef BUCKS -#undef LEVELS -#undef BITS -} - -static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z) -{ - for (;;) - { - if (x->key <= y->key) - { - *z++ = *x++; - if (x >= xl) - goto xend; - } - else - { - *z++ = *y++; - if (y >= yl) - goto yend; - } - } - - xend: - while (y < yl) - *z++ = *y++; - return z; - - yend: - while (x < xl) - *z++ = *x++; - return z; -} - -static void mergesort(void) -{ - struct elt *from, *to; - uns lev = 0; - if (1) - { - struct elt *x = ary, *z = alt, *last = ary + (n & ~1U); - while (x < last) - { - if (x[0].key < x[1].key) - *z++ = *x++, *z++ = *x++; - else - { - *z++ = x[1]; - *z++ = x[0]; - x += 2; - } - } - if (n % 2) - *z = *x; - lev++; - } - for (; (1U << lev) < n; lev++) - { - if (lev % 2) - from = alt, to = ary; - else - from = ary, to = alt; - struct elt *x, *z, *last; - x = from; - z = to; - last = from + n; - uns step = 1 << lev; - while (x + 2*step <= last) - { - z = mrg(x, x+step, x+step, x+2*step, z); - x += 2*step; - } - if (x + step < last) - mrg(x, x+step, x+step, last, z); - else - memcpy(z, x, (byte*)last - (byte*)x); - } - if (lev % 2) - ary = alt; -} - -static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf) -{ -#define WAYS 256 - struct elt k[WAYS]; - uns cnt[WAYS]; - bzero(cnt, sizeof(cnt)); - for (uns i=0; i k[w+delta].key) w += delta - FW(128); - FW(64); - FW(32); - FW(16); - FW(8); - FW(4); - FW(2); - FW(1); - wbuf[i] = w; - cnt[w]++; - } - struct elt *y = al, *way[WAYS], *z; - for (uns i=0; i= 1000) - sampsort(cnt[i], y, z, dest, wbuf); - else - { - as_sort(cnt[i], y); - if (al != dest) - memcpy(z, y, cnt[i]*sizeof(struct elt)); - } - y += cnt[i]; - z += cnt[i]; - } -#undef FW -#undef WAYS -} - -static void samplesort(void) -{ - byte *aux = xmalloc(n); - sampsort(n, ary, alt, ary, aux); - xfree(aux); -} - -static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf) -{ -#define WAYS 256 - struct elt k[WAYS]; - uns cnt[WAYS]; - bzero(cnt, sizeof(cnt)); - for (uns i=0; ikey > k[w1+delta].key) w1 += delta -#define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta - FW1(128); FW2(128); - FW1(64); FW2(64); - FW1(32); FW2(32); - FW1(16); FW2(16); - FW1(8); FW2(8); - FW1(4); FW2(4); - FW1(2); FW2(2); - FW1(1); FW2(1); - *ww++ = w1; - *ww++ = w2; - cnt[w1]++; - cnt[w2]++; - k1 += 2; - k2 += 2; - } - if (k1 < kend) - { - uns w1 = 0; - FW1(128); FW1(64); FW1(32); FW1(16); - FW1(8); FW1(4); FW1(2); FW1(1); - *ww++ = w1; - cnt[w1]++; - } - struct elt *y = al, *way[WAYS], *z; - for (uns i=0; i= 1000) - sampsort2(cnt[i], y, z, dest, wbuf); - else - { - as_sort(cnt[i], y); - if (al != dest) - memcpy(z, y, cnt[i]*sizeof(struct elt)); - } - y += cnt[i]; - z += cnt[i]; - } -#undef FW1 -#undef FW2 -#undef WAYS -} - -static void samplesort2(void) -{ - byte *aux = xmalloc(n); - sampsort2(n, ary, alt, ary, aux); - xfree(aux); -} - -static void heapsort(void) -{ -#define H_LESS(_a,_b) ((_a).key > (_b).key) - struct elt *heap = ary-1; - HEAP_INIT(struct elt, heap, n, H_LESS, HEAP_SWAP); - uns nn = n; - while (nn) - HEAP_DELMIN(struct elt, heap, nn, H_LESS, HEAP_SWAP); -#undef H_LESS -} - -static void heapsort_ind(void) -{ -#define H_LESS(_a,_b) ((_a)->key > (_b)->key) - struct elt **heap = ind-1; - HEAP_INIT(struct elt *, heap, n, H_LESS, HEAP_SWAP); - uns nn = n; - while (nn) - HEAP_DELMIN(struct elt *, heap, nn, H_LESS, HEAP_SWAP); -#undef H_LESS -} - -static void mk_ary(void) -{ - ary = array0; - alt = array1; - md5_context ctx; - md5_init(&ctx); - u32 block[16]; - bzero(block, sizeof(block)); - - sum = 0; - for (uns i=0; ikey; - for (uns i=1; ikey < ind[i-1]->key) - die("Missorted at %d", i); - else - s ^= ind[i]->key; - if (s != sum) - die("Corrupted"); - xfree(ind); -} - -int main(int argc, char **argv) -{ - log_init(argv[0]); - - int opt; - uns op = 0; - while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0) - switch (opt) - { - case '1': - op |= (1 << (opt - '0')); - break; - default: - die("usage?"); - } - - array0 = alloc_elts(n); - array1 = alloc_elts(n); - for (uns i=0; i + */ + +#include "ucw/lib.h" +#include "ucw/conf.h" +#include "ucw/lfs.h" +#include "ucw/asio.h" + +#include +#include +#include +#include +#include + +#define COPY +#define DIRECT O_DIRECT + +static timestamp_t timer; + +#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0) +#define P_UPDATE(cc) do { \ + cnt += cc; \ + if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \ + printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \ + fflush(stdout); cnt_rep += 1<<26; } } while(0) +#define P_FINAL do { \ + cnt_ms += get_timer(&timer); \ + msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \ +} while(0) + +static struct asio_queue io_queue; + +int main(int argc, char **argv) +{ + uns files, bufsize; + u64 total_size; + if (argc != 4 || + cf_parse_int(argv[1], (int*) &files) || + cf_parse_int(argv[2], (int*) &bufsize) || + cf_parse_u64(argv[3], &total_size)) + { + fprintf(stderr, "Usage: asio-test \n"); + return 1; + } + u64 cnt, cnt_rep; + uns cnt_ms; + int fd[files]; + byte name[files][16]; + struct asio_request *req[files]; + + init_timer(&timer); + + io_queue.buffer_size = bufsize; + io_queue.max_writebacks = 2; + asio_init_queue(&io_queue); + +#ifdef COPY + msg(L_INFO, "Creating input file"); + int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666); + ASSERT(in_fd >= 0); + ASSERT(!(total_size % bufsize)); + P_INIT; + for (uns i=0; iop = ASIO_WRITE_BACK; + r->fd = in_fd; + r->len = bufsize; + byte *xbuf = r->buffer; + for (uns j=0; j> 20), files, bufsize); + P_INIT; + for (uns i=0; iop = ASIO_READ; + rd->fd = in_fd; + rd->len = bufsize; + asio_submit(rd); + rr = asio_wait(&io_queue); + ASSERT(rr == rd && rd->status == (int)rd->len); + memcpy(r->buffer, rd->buffer, bufsize); + asio_put(rr); +#else + for (uns j=0; jbuffer[j] = round+i+j; +#endif + r->op = ASIO_WRITE_BACK; + r->fd = fd[i]; + r->len = bufsize; + asio_submit(r); + P_UPDATE(bufsize); + req[i] = asio_get(&io_queue); + } + } + for (uns i=0; iop = ASIO_READ; + r->fd = fd[i]; + r->len = bufsize; + asio_submit(r); + rr = asio_wait(&io_queue); + ASSERT(rr == r && r->status == (int)bufsize); + asio_put(r); + P_UPDATE(bufsize); + } + close(fd[i]); + } + P_FINAL; + + for (uns i=0; i + */ + +#include "ucw/lib.h" +#include "ucw/conf.h" +#include "ucw/lfs.h" + +#include +#include +#include +#include + +#define COPY +#define DIRECT 0 // or O_DIRECT + +static timestamp_t timer; + +#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0) +#define P_UPDATE(cc) do { \ + cnt += cc; \ + if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \ + printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \ + fflush(stdout); cnt_rep += 1<<26; } } while(0) +#define P_FINAL do { \ + cnt_ms += get_timer(&timer); \ + msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \ +} while(0) + +int main(int argc, char **argv) +{ + uns files, bufsize; + u64 total_size; + if (argc != 4 || + cf_parse_int(argv[1], (int*) &files) || + cf_parse_int(argv[2], (int*) &bufsize) || + cf_parse_u64(argv[3], &total_size)) + { + fprintf(stderr, "Usage: file-test \n"); + return 1; + } + u64 cnt, cnt_rep; + uns cnt_ms; + int fd[files]; + byte *buf[files], name[files][16]; + uns xbufsize = bufsize; // Used for single-file I/O + byte *xbuf = big_alloc(xbufsize); + + init_timer(&timer); + +#ifdef COPY + msg(L_INFO, "Creating input file"); + int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666); + ASSERT(in_fd >= 0); + ASSERT(!(total_size % xbufsize)); + P_INIT; + for (uns i=0; i> 20), files, bufsize); + P_INIT; + for (uns r=0; r +set -e +UCW_PROGNAME="$0" +. lib/libucw.sh + +# Path to Sherlock build directory +[ -n "$BUILD" ] || BUILD=.. +[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory" + +# Find out sort buffer size +parse-config 'Sorter{##SortBuffer}' +SORTBUF=$CF_Sorter_SortBuffer +[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer" +log "Detected sort buffer size $SORTBUF" + +# Size of the test -- should be slightly less than a half of SortBuffer +SIZE=$(($SORTBUF/2 - 8192)) +log "Decided to benchmark sorting of $SIZE byte data" + +# Which bit widths we try +WIDTHS="0 6 7 8 9 10 11 12 13 14" + +# Which RadixThresholds we try +THRS="2000 4000 10000 20000 50000" + +# Which sort-test tests we try +TESTS="2,5,8,15" + +# Check various bit widths of the radix sorter +rm -f tmp/radix-* +for W in $WIDTHS ; do + rm -f $BUILD/obj/ucw/sorter/sort-test{,.o} + if [ $W = 0 ] ; then + log "Compiling with no radix splits" + ( cd $BUILD && make obj/ucw/sorter/sort-test ) + OPT="-d32" + else + log "Compiling with $W-bit radix splits" + ( cd $BUILD && make CEXTRA="-DFORCE_RADIX_BITS=$W" obj/ucw/sorter/sort-test ) + OPT= + fi + for THR in $THRS ; do + log "Testing with RadixThreshold=$THR" + $BUILD/obj/ucw/sorter/sort-test -SThreads.DefaultStackSize=2M -SSorter.RadixThreshold=$THR -s$SIZE -t$TESTS $OPT -v 2>&1 | tee -a tmp/radix-$W + done +done + +echo "thresh" >tmp/radix-thrs +echo "test#" >tmp/radix-tests +for THR in $THRS ; do + for TEST in `echo $TESTS | tr ',' ' '` ; do + echo $THR >>tmp/radix-thrs + echo $TEST >>tmp/radix-tests + done +done + +FILES="tmp/radix-thrs tmp/radix-tests" +for W in $WIDTHS ; do + a=tmp/radix-$W + echo >$a.out "$W bits" + sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <$a >>$a.out + FILES="$FILES $a.out" +done + +log "These are the results:" +paste $FILES diff --git a/ucw/sorter/debug/radix-tune-thresh.sh b/ucw/sorter/debug/radix-tune-thresh.sh new file mode 100644 index 00000000..71d929b3 --- /dev/null +++ b/ucw/sorter/debug/radix-tune-thresh.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# An utility for tuning the Sherlock's radix sorter threshold +# (c) 2007 Martin Mares +set -e +UCW_PROGNAME="$0" +. lib/libucw.sh + +# Path to Sherlock build directory +[ -n "$BUILD" ] || BUILD=.. +[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory" + +# Find out sort buffer size +parse-config 'Sorter{##SortBuffer}' +SORTBUF=$CF_Sorter_SortBuffer +[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer" +log "Detected sort buffer size $SORTBUF" + +# Find out radix-sorter width +[ -f "$BUILD/obj/config.mk" ] || die "Sherlock source not configured" +WIDTH=`sed <$BUILD/obj/config.mk 's/^CONFIG_UCW_RADIX_SORTER_BITS=\(.*\)/\1/;t;d'` +[ -n "$WIDTH" ] || die "CONFIG_UCW_RADIX_SORTER_BITS not set (!?)" +log "Detected radix-sorter width $WIDTH" + +# Maximum size of the test -- should be slightly less than a half of SortBuffer +SIZE=$(($SORTBUF/2 - 8192)) + +# Which sort-test test we try +TEST="2" + +# Which thresholds we try +THRS="16" +T=$SIZE +while [ $T -gt 100 ] ; do + THRS="$THRS $T" + T=$(($T/2)) +done + +if true ; then + +rm -f tmp/radix-* +echo "sizes" >tmp/radix-sizes +while [ $SIZE -gt 262144 ] ; do + echo $SIZE >>tmp/radix-sizes + for T in $THRS ; do + log "Trying size $SIZE with threshold $T" + $BUILD/obj/ucw/sorter/sort-test -SSorter.RadixThreshold=$T -s$SIZE -t$TEST -v 2>&1 | tee -a tmp/radix-$T + done + SIZE=$(($SIZE/2)) +done + +fi + +FILES=tmp/radix-sizes +for T in $THRS ; do + a=tmp/radix-$T + echo >$a.out $T + sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <$a >>$a.out + FILES="$FILES $a.out" +done + +log "These are the results:" +paste $FILES diff --git a/ucw/sorter/debug/retros.c b/ucw/sorter/debug/retros.c new file mode 100644 index 00000000..7bb692f6 --- /dev/null +++ b/ucw/sorter/debug/retros.c @@ -0,0 +1,763 @@ +/* + * Experiments with various sorting algorithms + * + * (c) 2007--2008 Martin Mares + */ + +#include "sherlock/sherlock.h" +#include "ucw/getopt.h" +#include "ucw/md5.h" +#include "ucw/heap.h" + +#include +#include +#include +#include +#include + +struct elt { + u32 key; + u32 ballast[3]; +}; + +static struct elt *ary, *alt, **ind, *array0, *array1; +static uns n = 10000000; +static u32 sum; + +static struct elt *alloc_elts(uns n) +{ + return big_alloc(n * sizeof(struct elt)); +} + +static void free_elts(struct elt *a, uns n) +{ + big_free(a, n * sizeof(struct elt)); +} + +static int comp(const void *x, const void *y) +{ + const struct elt *xx = x, *yy = y; + return (xx->key < yy->key) ? -1 : (xx->key > yy->key) ? 1 : 0; +} + +static int comp_ind(const void *x, const void *y) +{ + const struct elt * const *xx = x, * const *yy = y; + return comp(*xx, *yy); +} + +#define ASORT_PREFIX(x) as_##x +#define ASORT_KEY_TYPE u32 +#define ASORT_ELT(i) a[i].key +#define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0) +#define ASORT_EXTRA_ARGS , struct elt *a +#include "ucw/sorter/array-simple.h" + +#define ASORT_PREFIX(x) asi_##x +#define ASORT_KEY_TYPE u32 +#define ASORT_ELT(i) ind[i]->key +#define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0) +#include "ucw/sorter/array-simple.h" + +static void r1_sort(void) +{ + struct elt *from = ary, *to = alt, *tmp; +#define BITS 8 + uns cnt[1 << BITS]; + for (uns sh=0; sh<32; sh+=BITS) + { + bzero(cnt, sizeof(cnt)); + for (uns i=0; i> sh) & ((1 << BITS) - 1)]++; + uns pos = 0; + for (uns i=0; i<(1<> sh) & ((1 << BITS) - 1)]++] = from[i]; + ASSERT(cnt[(1 << BITS)-1] == n); + tmp=from, from=to, to=tmp; + } + ary = from; +#undef BITS +} + +static void r1b_sort(void) +{ + struct elt *from = ary, *to = alt, *tmp; +#define BITS 8 + uns cnt[1 << BITS], cnt2[1 << BITS]; + for (uns sh=0; sh<32; sh+=BITS) + { + if (sh) + memcpy(cnt, cnt2, sizeof(cnt)); + else + { + bzero(cnt, sizeof(cnt)); + for (uns i=0; i> sh) & ((1 << BITS) - 1)]++; + } + uns pos = 0; + for (uns i=0; i<(1<> (sh + BITS)) & ((1 << BITS) - 1)]++; + to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i]; + } + ASSERT(cnt[(1 << BITS)-1] == n); + tmp=from, from=to, to=tmp; + } + ary = from; +#undef BITS +} + +static void r1c_sort(void) +{ + uns cnt[256]; + struct elt *ptrs[256], *x, *lim; + + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + cnt[x++->key & 255]++; + +#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; } + + PTRS(alt); + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 8) & 255]++; + *ptrs[x->key & 255]++ = *x; + x++; + } + + PTRS(ary); + x = alt; lim = alt + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 16) & 255]++; + *ptrs[(x->key >> 8) & 255]++ = *x; + x++; + } + + PTRS(alt); + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 24) & 255]++; + *ptrs[(x->key >> 16) & 255]++ = *x; + x++; + } + + PTRS(ary); + x = alt; lim = alt + n; + while (x < lim) + { + *ptrs[(x->key >> 24) & 255]++ = *x; + x++; + } +#undef PTRS +} + +#include + +static inline void sse_copy_elt(struct elt *to, struct elt *from) +{ + __m128i m = _mm_load_si128((__m128i *) from); + _mm_store_si128((__m128i *) to, m); +} + +static void r1c_sse_sort(void) +{ + uns cnt[256]; + struct elt *ptrs[256], *x, *lim; + + ASSERT(sizeof(struct elt) == 16); + ASSERT(!((uintptr_t)alt & 15)); + ASSERT(!((uintptr_t)ary & 15)); + + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + cnt[x++->key & 255]++; + +#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; } + + PTRS(alt); + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 8) & 255]++; + sse_copy_elt(ptrs[x->key & 255]++, x); + x++; + } + + PTRS(ary); + x = alt; lim = alt + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 16) & 255]++; + sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x); + x++; + } + + PTRS(alt); + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 24) & 255]++; + sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x); + x++; + } + + PTRS(ary); + x = alt; lim = alt + n; + while (x < lim) + { + sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x); + x++; + } +#undef PTRS +} + +static void r1d_sort(void) +{ + uns cnt[256]; + struct elt *ptrs[256], *x, *y, *lim; + + ASSERT(!(n % 4)); + + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[x++->key & 255]++; + cnt[x++->key & 255]++; + cnt[x++->key & 255]++; + cnt[x++->key & 255]++; + } + +#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; } + + PTRS(alt); + x = ary; y = ary+n/2; lim = ary + n/2; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 8) & 255]++; + cnt[(y->key >> 8) & 255]++; + *ptrs[x->key & 255]++ = *x; + *ptrs[y->key & 255]++ = *y; + x++, y++; + cnt[(x->key >> 8) & 255]++; + cnt[(y->key >> 8) & 255]++; + *ptrs[x->key & 255]++ = *x; + *ptrs[y->key & 255]++ = *y; + x++, y++; + } + + PTRS(ary); + x = alt; lim = alt + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 16) & 255]++; + *ptrs[(x->key >> 8) & 255]++ = *x; + x++; + cnt[(x->key >> 16) & 255]++; + *ptrs[(x->key >> 8) & 255]++ = *x; + x++; + } + + PTRS(alt); + x = ary; lim = ary + n; + bzero(cnt, sizeof(cnt)); + while (x < lim) + { + cnt[(x->key >> 24) & 255]++; + *ptrs[(x->key >> 16) & 255]++ = *x; + x++; + cnt[(x->key >> 24) & 255]++; + *ptrs[(x->key >> 16) & 255]++ = *x; + x++; + } + + PTRS(ary); + x = alt; lim = alt + n; + while (x < lim) + { + *ptrs[(x->key >> 24) & 255]++ = *x; + x++; + *ptrs[(x->key >> 24) & 255]++ = *x; + x++; + } +#undef PTRS +} + +static void r2_sort(void) +{ + struct elt *from = ary, *to = alt; +#define BITS 14 + uns cnt[1 << BITS]; + bzero(cnt, sizeof(cnt)); + for (uns i=0; i> (32 - BITS)) & ((1 << BITS) - 1)]++; + uns pos = 0; + for (uns i=0; i<(1<> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i]; + ASSERT(cnt[(1 << BITS)-1] == n); + + pos = 0; + for (uns i=0; i<(1 << BITS); i++) + { + as_sort(cnt[i] - pos, alt+pos); + pos = cnt[i]; + } + ary = alt; +#undef BITS +} + +static void r3_sort(void) +{ +#define BITS 10 +#define LEVELS 2 +#define BUCKS (1 << BITS) +#define THRESHOLD 5000 +#define ODDEVEN 0 + + auto void r3(struct elt *from, struct elt *to, uns n, uns lev); + void r3(struct elt *from, struct elt *to, uns n, uns lev) + { + uns sh = 32 - lev*BITS; + uns cnt[BUCKS]; + bzero(cnt, sizeof(cnt)); + for (uns i=0; i> sh) & (BUCKS - 1)]++; + uns pos = 0; + for (uns i=0; i> sh) & (BUCKS - 1)]++] = from[i]; +#else + sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]); +#endif + pos = 0; + for (uns i=0; i= LEVELS || l <= THRESHOLD) + { + as_sort(l, to+pos); + if ((lev % 2) != ODDEVEN) + memcpy(from+pos, to+pos, l * sizeof(struct elt)); + } + else + r3(to+pos, from+pos, l, lev+1); + pos = cnt[i]; + } + } + + r3(ary, alt, n, 1); + if (ODDEVEN) + ary = alt; + +#undef ODDEVEN +#undef THRESHOLD +#undef BUCKS +#undef LEVELS +#undef BITS +} + +static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z) +{ + for (;;) + { + if (x->key <= y->key) + { + *z++ = *x++; + if (x >= xl) + goto xend; + } + else + { + *z++ = *y++; + if (y >= yl) + goto yend; + } + } + + xend: + while (y < yl) + *z++ = *y++; + return z; + + yend: + while (x < xl) + *z++ = *x++; + return z; +} + +static void mergesort(void) +{ + struct elt *from, *to; + uns lev = 0; + if (1) + { + struct elt *x = ary, *z = alt, *last = ary + (n & ~1U); + while (x < last) + { + if (x[0].key < x[1].key) + *z++ = *x++, *z++ = *x++; + else + { + *z++ = x[1]; + *z++ = x[0]; + x += 2; + } + } + if (n % 2) + *z = *x; + lev++; + } + for (; (1U << lev) < n; lev++) + { + if (lev % 2) + from = alt, to = ary; + else + from = ary, to = alt; + struct elt *x, *z, *last; + x = from; + z = to; + last = from + n; + uns step = 1 << lev; + while (x + 2*step <= last) + { + z = mrg(x, x+step, x+step, x+2*step, z); + x += 2*step; + } + if (x + step < last) + mrg(x, x+step, x+step, last, z); + else + memcpy(z, x, (byte*)last - (byte*)x); + } + if (lev % 2) + ary = alt; +} + +static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf) +{ +#define WAYS 256 + struct elt k[WAYS]; + uns cnt[WAYS]; + bzero(cnt, sizeof(cnt)); + for (uns i=0; i k[w+delta].key) w += delta + FW(128); + FW(64); + FW(32); + FW(16); + FW(8); + FW(4); + FW(2); + FW(1); + wbuf[i] = w; + cnt[w]++; + } + struct elt *y = al, *way[WAYS], *z; + for (uns i=0; i= 1000) + sampsort(cnt[i], y, z, dest, wbuf); + else + { + as_sort(cnt[i], y); + if (al != dest) + memcpy(z, y, cnt[i]*sizeof(struct elt)); + } + y += cnt[i]; + z += cnt[i]; + } +#undef FW +#undef WAYS +} + +static void samplesort(void) +{ + byte *aux = xmalloc(n); + sampsort(n, ary, alt, ary, aux); + xfree(aux); +} + +static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf) +{ +#define WAYS 256 + struct elt k[WAYS]; + uns cnt[WAYS]; + bzero(cnt, sizeof(cnt)); + for (uns i=0; ikey > k[w1+delta].key) w1 += delta +#define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta + FW1(128); FW2(128); + FW1(64); FW2(64); + FW1(32); FW2(32); + FW1(16); FW2(16); + FW1(8); FW2(8); + FW1(4); FW2(4); + FW1(2); FW2(2); + FW1(1); FW2(1); + *ww++ = w1; + *ww++ = w2; + cnt[w1]++; + cnt[w2]++; + k1 += 2; + k2 += 2; + } + if (k1 < kend) + { + uns w1 = 0; + FW1(128); FW1(64); FW1(32); FW1(16); + FW1(8); FW1(4); FW1(2); FW1(1); + *ww++ = w1; + cnt[w1]++; + } + struct elt *y = al, *way[WAYS], *z; + for (uns i=0; i= 1000) + sampsort2(cnt[i], y, z, dest, wbuf); + else + { + as_sort(cnt[i], y); + if (al != dest) + memcpy(z, y, cnt[i]*sizeof(struct elt)); + } + y += cnt[i]; + z += cnt[i]; + } +#undef FW1 +#undef FW2 +#undef WAYS +} + +static void samplesort2(void) +{ + byte *aux = xmalloc(n); + sampsort2(n, ary, alt, ary, aux); + xfree(aux); +} + +static void heapsort(void) +{ +#define H_LESS(_a,_b) ((_a).key > (_b).key) + struct elt *heap = ary-1; + HEAP_INIT(struct elt, heap, n, H_LESS, HEAP_SWAP); + uns nn = n; + while (nn) + HEAP_DELMIN(struct elt, heap, nn, H_LESS, HEAP_SWAP); +#undef H_LESS +} + +static void heapsort_ind(void) +{ +#define H_LESS(_a,_b) ((_a)->key > (_b)->key) + struct elt **heap = ind-1; + HEAP_INIT(struct elt *, heap, n, H_LESS, HEAP_SWAP); + uns nn = n; + while (nn) + HEAP_DELMIN(struct elt *, heap, nn, H_LESS, HEAP_SWAP); +#undef H_LESS +} + +static void mk_ary(void) +{ + ary = array0; + alt = array1; + md5_context ctx; + md5_init(&ctx); + u32 block[16]; + bzero(block, sizeof(block)); + + sum = 0; + for (uns i=0; ikey; + for (uns i=1; ikey < ind[i-1]->key) + die("Missorted at %d", i); + else + s ^= ind[i]->key; + if (s != sum) + die("Corrupted"); + xfree(ind); +} + +int main(int argc, char **argv) +{ + log_init(argv[0]); + + int opt; + uns op = 0; + while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0) + switch (opt) + { + case '1': + op |= (1 << (opt - '0')); + break; + default: + die("usage?"); + } + + array0 = alloc_elts(n); + array1 = alloc_elts(n); + for (uns i=0; i