+++ /dev/null
-How retros runs on different hardware:
-
-# 32-bit Athlon 64, gcc-4.1
-mj@albireo:~/src/sh/dev-sorter/run$ bin/retros
-D 2006-11-23 23:17:36 [retros] memcpy: 212
-D 2006-11-23 23:17:44 [retros] qsort: 6947
-D 2006-11-23 23:17:48 [retros] arraysort: 3183
-D 2006-11-23 23:18:02 [retros] indirect qsort: 13116
-D 2006-11-23 23:18:24 [retros] indirect arraysort: 19176
-D 2006-11-23 23:18:30 [retros] radix1: 3755
-D 2006-11-23 23:18:34 [retros] radix1b: 3100
-D 2006-11-23 23:18:39 [retros] radix1c: 2777
-D 2006-11-23 23:18:43 [retros] radix1c-sse: 2602
-D 2006-11-23 23:18:47 [retros] radix1d: 2728
-D 2006-11-23 23:18:53 [retros] radix2: 4249
-D 2006-11-23 23:18:57 [retros] radix3: 2577
-D 2006-11-23 23:19:09 [retros] mergesort: 10399
-D 2006-11-23 23:19:16 [retros] samplesort: 5698
-D 2006-11-23 23:19:23 [retros] samplesort2: 5016
-
-# 32-bit P4 Xeon, gcc-3.4
-sherlock@sherlock3:~/sherlock-mj/run$ bin/retros
-D 2006-11-23 23:23:52 [retros] memcpy: 198
-D 2006-11-23 23:24:23 [retros] qsort: 30114
-D 2006-11-23 23:24:27 [retros] arraysort: 2882
-D 2006-11-23 23:24:43 [retros] indirect qsort: 15019
-D 2006-11-23 23:24:59 [retros] indirect arraysort: 13267
-D 2006-11-23 23:25:03 [retros] radix1: 1881
-D 2006-11-23 23:25:06 [retros] radix1b: 1442
-D 2006-11-23 23:25:08 [retros] radix1c: 1313
-D 2006-11-23 23:25:10 [retros] radix1c-sse: 1229
-D 2006-11-23 23:25:13 [retros] radix1d: 1324
-D 2006-11-23 23:25:17 [retros] radix2: 2598
-D 2006-11-23 23:25:19 [retros] radix3: 1419
-D 2006-11-23 23:25:25 [retros] mergesort: 4929
-D 2006-11-23 23:25:29 [retros] samplesort: 2742
-D 2006-11-23 23:25:33 [retros] samplesort2: 2350
-
-# 64-bit P4 Xeon, gcc-3.4
-sherlock@sherlock4:~/sherlock-3.10/run$ bin/retros
-D 2006-11-23 23:44:31 [retros] memcpy: 132
-D 2006-11-23 23:44:58 [retros] qsort: 26469
-D 2006-11-23 23:45:01 [retros] arraysort: 2307
-D 2006-11-23 23:45:12 [retros] indirect qsort: 10971
-D 2006-11-23 23:45:24 [retros] indirect arraysort: 10350
-D 2006-11-23 23:45:26 [retros] radix1: 1099
-D 2006-11-23 23:45:27 [retros] radix1b: 1052
-D 2006-11-23 23:45:29 [retros] radix1c: 1017
-D 2006-11-23 23:45:30 [retros] radix1c-sse: 1017
-D 2006-11-23 23:45:32 [retros] radix1d: 1016
-D 2006-11-23 23:45:34 [retros] radix2: 1661
-D 2006-11-23 23:45:36 [retros] radix3: 955
-D 2006-11-23 23:45:39 [retros] mergesort: 3302
-D 2006-11-23 23:45:42 [retros] samplesort: 2376
-D 2006-11-23 23:45:45 [retros] samplesort2: 1870
-
-# 64-bit Turion X2 TL52, gcc-4.1.1
-pchar@paja ~/prog/sherlock-dev-sorter/run $ bin/retros
-D 2006-11-24 00:32:38 [retros] memcpy: 93
-D 2006-11-24 00:32:46 [retros] qsort: 7530
-D 2006-11-24 00:32:50 [retros] arraysort: 2766
-D 2006-11-24 00:33:01 [retros] indirect qsort: 10543
-D 2006-11-24 00:33:13 [retros] indirect arraysort: 10169
-D 2006-11-24 00:33:16 [retros] radix1: 1319
-D 2006-11-24 00:33:18 [retros] radix1b: 1126
-D 2006-11-24 00:33:20 [retros] radix1c: 1084
-D 2006-11-24 00:33:22 [retros] radix1c-sse: 1126
-D 2006-11-24 00:33:24 [retros] radix1d: 1091
-D 2006-11-24 00:33:27 [retros] radix2: 2238
-D 2006-11-24 00:33:29 [retros] radix3: 1183
-D 2006-11-24 00:33:34 [retros] mergesort: 4036
-D 2006-11-24 00:33:37 [retros] samplesort: 2594
-D 2006-11-24 00:33:40 [retros] samplesort2: 2214
+++ /dev/null
-# Tests related to the new sorter
-#
-# Only the two radix-tune-* scripts are added to PROGS here. The C
-# experiments (retros, radix-file-test, radix-asio-test) merely get link
-# rules and are not listed in PROGS in this fragment.
-
-DIRS+=debug/sorter
-PROGS+=$(addprefix $(o)/debug/sorter/,radix-tune-bits radix-tune-thresh)
-
-$(o)/debug/sorter/retros: $(o)/debug/sorter/retros.o $(LIBSH)
-$(o)/debug/sorter/radix-file-test: $(o)/debug/sorter/radix-file-test.o $(LIBSH)
-$(o)/debug/sorter/radix-asio-test: $(o)/debug/sorter/radix-asio-test.o $(LIBSH)
-$(o)/debug/sorter/radix-tune-bits: $(s)/debug/sorter/radix-tune-bits.sh
-$(o)/debug/sorter/radix-tune-thresh: $(s)/debug/sorter/radix-tune-thresh.sh
+++ /dev/null
-/*
- * An experiment with parallel reading and writing of files using ASIO.
- *
- * (c) 2007 Martin Mares <mj@ucw.cz>
- */
-
-#include "ucw/lib.h"
-#include "ucw/conf.h"
-#include "ucw/lfs.h"
-#include "ucw/asio.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fcntl.h>
-#include <unistd.h>
-
-#define COPY
-#define DIRECT O_DIRECT
-
-static timestamp_t timer;
-
-/*
- * Progress reporting helpers. They expand in place and use the locals of
- * the surrounding function: cnt (bytes processed so far), cnt_rep (value of
- * cnt at which the next progress line is due), cnt_ms (accumulated timer
- * readings; treated as milliseconds by the arithmetic below) and total_size.
- */
-/* Reset the counters. cnt_ms starts at 1, so the rate computations never divide by zero. */
-#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0)
-/* Account for cc more bytes; print a progress line on stdout each time cnt reaches cnt_rep, then move cnt_rep forward by 1<<26 bytes (64 MB). */
-#define P_UPDATE(cc) do { \
- cnt += cc; \
- if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \
- printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \
- fflush(stdout); cnt_rep += 1<<26; } } while(0)
-/* Add the remaining timer reading and log the total time and throughput. */
-#define P_FINAL do { \
- cnt_ms += get_timer(&timer); \
- msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \
-} while(0)
-
-static struct asio_queue io_queue;
-
-/*
- * Benchmark driver. With COPY defined it first creates the input file
- * tmp/ft-in, then copies it buffer by buffer round-robin into <nr-files>
- * output files through the ASIO queue, and finally reads the output files
- * back one after another. Every phase reports its throughput.
- *
- * Usage: asio-test <nr-files> <bufsize> <totalsize>
- * Returns 1 when the arguments are unusable, 0 on success; I/O failures
- * end in die() or a failed ASSERT.
- */
-int main(int argc, char **argv)
-{
-  uns files, bufsize;
-  u64 total_size;
-  /*
-   * Both counts are parsed as signed ints and later serve as divisors and
-   * as array sizes, so zero and negative values have to be rejected here.
-   */
-  if (argc != 4 ||
-      cf_parse_int(argv[1], (int*) &files) ||
-      cf_parse_int(argv[2], (int*) &bufsize) ||
-      cf_parse_u64(argv[3], &total_size) ||
-      (int) files <= 0 ||
-      (int) bufsize <= 0)
-    {
-      fprintf(stderr, "Usage: asio-test <nr-files> <bufsize> <totalsize>\n");
-      return 1;
-    }
-  u64 cnt, cnt_rep;
-  uns cnt_ms;
-  int fd[files];
-  byte name[files][24];                 /* "tmp/ft-" + at most 10 digits + NUL */
-  struct asio_request *req[files];
-
-  init_timer(&timer);
-
-  io_queue.buffer_size = bufsize;
-  io_queue.max_writebacks = 2;
-  asio_init_queue(&io_queue);
-
-#ifdef COPY
-  msg(L_INFO, "Creating input file");
-  int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
-  ASSERT(in_fd >= 0);
-  ASSERT(!(total_size % bufsize));
-  P_INIT;
-  for (uns i=0; i<total_size/bufsize; i++)
-    {
-      struct asio_request *r = asio_get(&io_queue);
-      r->op = ASIO_WRITE_BACK;
-      r->fd = in_fd;
-      r->len = bufsize;
-      byte *xbuf = r->buffer;
-      for (uns j=0; j<bufsize; j++)
-        xbuf[j] = i+j;
-      asio_submit(r);
-      P_UPDATE(bufsize);
-    }
-  asio_sync(&io_queue);
-  lseek(in_fd, 0, SEEK_SET);
-  sync();
-  P_FINAL;
-#endif
-
-  msg(L_INFO, "Initializing output files");
-  for (uns i=0; i<files; i++)
-    {
-      /* Bounded formatting: never write past the end of name[i] */
-      snprintf((char *) name[i], sizeof(name[i]), "tmp/ft-%d", i);
-      fd[i] = ucw_open(name[i], O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
-      if (fd[i] < 0)
-        die("Cannot create %s: %m", name[i]);
-    }
-  sync();
-  get_timer(&timer);
-
-  msg(L_INFO, "Writing %d MB to %d files in parallel with %d byte buffers", (int)(total_size >> 20), files, bufsize);
-  P_INIT;
-  /* One request per output file is kept ready to be filled */
-  for (uns i=0; i<files; i++)
-    req[i] = asio_get(&io_queue);
-  for (uns round=0; round<total_size/bufsize/files; round++)
-    {
-      for (uns i=0; i<files; i++)
-        {
-          struct asio_request *r = req[i];
-#ifdef COPY
-          /* Read the next input buffer, wait for it and copy it into the write request */
-          struct asio_request *rr, *rd = asio_get(&io_queue);
-          rd->op = ASIO_READ;
-          rd->fd = in_fd;
-          rd->len = bufsize;
-          asio_submit(rd);
-          rr = asio_wait(&io_queue);
-          ASSERT(rr == rd && rd->status == (int)rd->len);
-          memcpy(r->buffer, rd->buffer, bufsize);
-          asio_put(rr);
-#else
-          for (uns j=0; j<bufsize; j++)
-            r->buffer[j] = round+i+j;
-#endif
-          r->op = ASIO_WRITE_BACK;
-          r->fd = fd[i];
-          r->len = bufsize;
-          asio_submit(r);
-          P_UPDATE(bufsize);
-          req[i] = asio_get(&io_queue);
-        }
-    }
-  for (uns i=0; i<files; i++)
-    asio_put(req[i]);
-  asio_sync(&io_queue);
-#ifdef COPY
-  close(in_fd);
-#endif
-  msg(L_INFO, "Syncing");
-  sync();
-  P_FINAL;
-
-  msg(L_INFO, "Reading the files sequentially");
-  P_INIT;
-  for (uns i=0; i<files; i++)
-    {
-      lseek(fd[i], 0, SEEK_SET);
-      for (uns round=0; round<total_size/bufsize/files; round++)
-        {
-          struct asio_request *rr, *r = asio_get(&io_queue);
-          r->op = ASIO_READ;
-          r->fd = fd[i];
-          r->len = bufsize;
-          asio_submit(r);
-          rr = asio_wait(&io_queue);
-          ASSERT(rr == r && r->status == (int)bufsize);
-          asio_put(r);
-          P_UPDATE(bufsize);
-        }
-      close(fd[i]);
-    }
-  P_FINAL;
-
-  for (uns i=0; i<files; i++)
-    unlink(name[i]);
-#ifdef COPY
-  unlink("tmp/ft-in");
-#endif
-
-  asio_cleanup_queue(&io_queue);
-  msg(L_INFO, "Done");
-  return 0;
-}
+++ /dev/null
-/*
- * An experiment with parallel reading and writing of files.
- *
- * (c) 2007 Martin Mares <mj@ucw.cz>
- */
-
-#include "ucw/lib.h"
-#include "ucw/conf.h"
-#include "ucw/lfs.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <unistd.h>
-
-#define COPY
-#define DIRECT 0 // or O_DIRECT
-
-static timestamp_t timer;
-
-/*
- * Progress reporting helpers. They expand in place and use the locals of
- * the surrounding function: cnt (bytes processed so far), cnt_rep (value of
- * cnt at which the next progress line is due), cnt_ms (accumulated timer
- * readings; treated as milliseconds by the arithmetic below) and total_size.
- */
-/* Reset the counters. cnt_ms starts at 1, so the rate computations never divide by zero. */
-#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0)
-/* Account for cc more bytes; print a progress line on stdout each time cnt reaches cnt_rep, then move cnt_rep forward by 1<<26 bytes (64 MB). */
-#define P_UPDATE(cc) do { \
- cnt += cc; \
- if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \
- printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \
- fflush(stdout); cnt_rep += 1<<26; } } while(0)
-/* Add the remaining timer reading and log the total time and throughput. */
-#define P_FINAL do { \
- cnt_ms += get_timer(&timer); \
- msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \
-} while(0)
-
-/*
- * Benchmark driver. With COPY defined it first creates the input file
- * tmp/ft-in, then copies it buffer by buffer round-robin into <nr-files>
- * output files using plain read()/write(), and finally reads the output
- * files back one after another. Every phase reports its throughput.
- *
- * Usage: file-test <nr-files> <bufsize> <totalsize>
- * Returns 1 when the arguments are unusable, 0 on success; I/O failures
- * end in die() or a failed ASSERT.
- */
-int main(int argc, char **argv)
-{
-  uns files, bufsize;
-  u64 total_size;
-  /*
-   * Both counts are parsed as signed ints and later serve as divisors and
-   * as array sizes, so zero and negative values have to be rejected here.
-   */
-  if (argc != 4 ||
-      cf_parse_int(argv[1], (int*) &files) ||
-      cf_parse_int(argv[2], (int*) &bufsize) ||
-      cf_parse_u64(argv[3], &total_size) ||
-      (int) files <= 0 ||
-      (int) bufsize <= 0)
-    {
-      fprintf(stderr, "Usage: file-test <nr-files> <bufsize> <totalsize>\n");
-      return 1;
-    }
-  u64 cnt, cnt_rep;
-  uns cnt_ms;
-  int fd[files];
-  byte *buf[files], name[files][24];    /* "tmp/ft-" + at most 10 digits + NUL */
-  uns xbufsize = bufsize;               // Used for single-file I/O
-  byte *xbuf = big_alloc(xbufsize);
-
-  init_timer(&timer);
-
-#ifdef COPY
-  msg(L_INFO, "Creating input file");
-  int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
-  ASSERT(in_fd >= 0);
-  ASSERT(!(total_size % xbufsize));
-  P_INIT;
-  for (uns i=0; i<total_size/xbufsize; i++)
-    {
-      for (uns j=0; j<xbufsize; j++)
-        xbuf[j] = i+j;
-      /* A failed write returns -1, which becomes a huge unsigned value and trips the ASSERT */
-      uns c = write(in_fd, xbuf, xbufsize);
-      ASSERT(c == xbufsize);
-      P_UPDATE(c);
-    }
-  lseek(in_fd, 0, SEEK_SET);
-  sync();
-  P_FINAL;
-#endif
-
-  msg(L_INFO, "Initializing output files");
-  for (uns i=0; i<files; i++)
-    {
-      /* Bounded formatting: never write past the end of name[i] */
-      snprintf((char *) name[i], sizeof(name[i]), "tmp/ft-%d", i);
-      fd[i] = ucw_open(name[i], O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
-      if (fd[i] < 0)
-        die("Cannot create %s: %m", name[i]);
-      buf[i] = big_alloc(bufsize);
-    }
-  sync();
-  get_timer(&timer);
-
-  msg(L_INFO, "Writing %d MB to %d files in parallel with %d byte buffers", (int)(total_size >> 20), files, bufsize);
-  P_INIT;
-  for (uns r=0; r<total_size/bufsize/files; r++)
-    {
-      for (uns i=0; i<files; i++)
-        {
-#ifdef COPY
-          uns ci = read(in_fd, buf[i], bufsize);
-          ASSERT(ci == bufsize);
-#else
-          for (uns j=0; j<bufsize; j++)
-            buf[i][j] = r+i+j;
-#endif
-          uns c = write(fd[i], buf[i], bufsize);
-          ASSERT(c == bufsize);
-          P_UPDATE(c);
-        }
-    }
-#ifdef COPY
-  close(in_fd);
-#endif
-  msg(L_INFO, "Syncing");
-  sync();
-  P_FINAL;
-
-  msg(L_INFO, "Reading the files sequentially");
-  P_INIT;
-  for (uns i=0; i<files; i++)
-    {
-      lseek(fd[i], 0, SEEK_SET);
-      for (uns r=0; r<total_size/xbufsize/files; r++)
-        {
-          uns c = read(fd[i], xbuf, xbufsize);
-          ASSERT(c == xbufsize);
-          P_UPDATE(c);
-        }
-      close(fd[i]);
-    }
-  P_FINAL;
-
-  for (uns i=0; i<files; i++)
-    unlink(name[i]);
-#ifdef COPY
-  unlink("tmp/ft-in");
-#endif
-  msg(L_INFO, "Done");
-  return 0;
-}
+++ /dev/null
-#!/bin/bash
-# A utility for tuning the Sherlock's radix sorter
-# (c) 2007 Martin Mares <mj@ucw.cz>
-#
-# Rebuilds sort-test once for every radix bit width in WIDTHS, runs it with
-# every threshold in THRS and finally prints a table of the "internal
-# sorting" times found in the logs. Paths lib/libucw.sh and tmp/ are taken
-# relative to the current directory; BUILD selects the build directory
-# (default: ..).
-set -e
-UCW_PROGNAME="$0"
-. lib/libucw.sh
-
-# Path to Sherlock build directory
-[ -n "$BUILD" ] || BUILD=..
-[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory"
-
-# Find out sort buffer size
-parse-config 'Sorter{##SortBuffer}'
-SORTBUF=$CF_Sorter_SortBuffer
-[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer"
-log "Detected sort buffer size $SORTBUF"
-
-# Size of the test -- should be slightly less than a half of SortBuffer
-SIZE=$(($SORTBUF/2 - 8192))
-log "Decided to benchmark sorting of $SIZE byte data"
-
-# Which bit widths we try
-WIDTHS="0 6 7 8 9 10 11 12 13 14"
-
-# Which RadixThresholds we try
-THRS="2000 4000 10000 20000 50000"
-
-# Which sort-test tests we try
-TESTS="2,5,8,15"
-
-# Check various bit widths of the radix sorter
-rm -f tmp/radix-*
-for W in $WIDTHS ; do
-  rm -f "$BUILD"/obj/ucw/sorter/sort-test{,.o}
-  if [ "$W" = 0 ] ; then
-    log "Compiling with no radix splits"
-    ( cd "$BUILD" && make obj/ucw/sorter/sort-test )
-    OPT="-d32"
-  else
-    log "Compiling with $W-bit radix splits"
-    ( cd "$BUILD" && make CEXTRA="-DFORCE_RADIX_BITS=$W" obj/ucw/sorter/sort-test )
-    OPT=
-  fi
-  for THR in $THRS ; do
-    log "Testing with RadixThreshold=$THR"
-    # $OPT stays unquoted on purpose: when empty it must expand to no argument at all
-    "$BUILD/obj/ucw/sorter/sort-test" -SThreads.DefaultStackSize=2M "-SSorter.RadixThreshold=$THR" "-s$SIZE" "-t$TESTS" $OPT -v 2>&1 | tee -a "tmp/radix-$W"
-  done
-done
-
-# Build the two label columns: one row per (threshold, test) pair
-echo "thresh" >tmp/radix-thrs
-echo "test#" >tmp/radix-tests
-for THR in $THRS ; do
-  for TEST in $(echo "$TESTS" | tr ',' ' ') ; do
-    echo "$THR" >>tmp/radix-thrs
-    echo "$TEST" >>tmp/radix-tests
-  done
-done
-
-# Extract one column of timings per bit width
-FILES="tmp/radix-thrs tmp/radix-tests"
-for W in $WIDTHS ; do
-  a="tmp/radix-$W"
-  echo >"$a.out" "$W bits"
-  sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <"$a" >>"$a.out"
-  FILES="$FILES $a.out"
-done
-
-log "These are the results:"
-# $FILES is a space-separated list and is split into words deliberately
-paste $FILES
+++ /dev/null
-#!/bin/bash
-# A utility for tuning the Sherlock's radix sorter threshold
-# (c) 2007 Martin Mares <mj@ucw.cz>
-#
-# Runs the already built sort-test on a series of halving input sizes, each
-# with a series of RadixThreshold values, and prints a table of the
-# "internal sorting" times found in the logs. Paths lib/libucw.sh and tmp/
-# are taken relative to the current directory; BUILD selects the build
-# directory (default: ..).
-set -e
-UCW_PROGNAME="$0"
-. lib/libucw.sh
-
-# Path to Sherlock build directory
-[ -n "$BUILD" ] || BUILD=..
-[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory"
-
-# Find out sort buffer size
-parse-config 'Sorter{##SortBuffer}'
-SORTBUF=$CF_Sorter_SortBuffer
-[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer"
-log "Detected sort buffer size $SORTBUF"
-
-# Find out radix-sorter width
-[ -f "$BUILD/obj/config.mk" ] || die "Sherlock source not configured"
-WIDTH=$(sed <"$BUILD/obj/config.mk" 's/^CONFIG_UCW_RADIX_SORTER_BITS=\(.*\)/\1/;t;d')
-[ -n "$WIDTH" ] || die "CONFIG_UCW_RADIX_SORTER_BITS not set (!?)"
-log "Detected radix-sorter width $WIDTH"
-
-# Maximum size of the test -- should be slightly less than a half of SortBuffer
-SIZE=$(($SORTBUF/2 - 8192))
-
-# Which sort-test test we try
-TEST="2"
-
-# Which thresholds we try
-THRS="16"
-T=$SIZE
-while [ "$T" -gt 100 ] ; do
-  THRS="$THRS $T"
-  T=$(($T/2))
-done
-
-# Change "true" to "false" to skip the measurements and only re-format existing logs
-if true ; then
-
-rm -f tmp/radix-*
-echo "sizes" >tmp/radix-sizes
-while [ "$SIZE" -gt 262144 ] ; do
-  echo "$SIZE" >>tmp/radix-sizes
-  for T in $THRS ; do
-    log "Trying size $SIZE with threshold $T"
-    "$BUILD/obj/ucw/sorter/sort-test" "-SSorter.RadixThreshold=$T" "-s$SIZE" "-t$TEST" -v 2>&1 | tee -a "tmp/radix-$T"
-  done
-  SIZE=$(($SIZE/2))
-done
-
-fi
-
-# Extract one column of timings per threshold
-FILES=tmp/radix-sizes
-for T in $THRS ; do
-  a="tmp/radix-$T"
-  echo >"$a.out" "$T"
-  sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <"$a" >>"$a.out"
-  FILES="$FILES $a.out"
-done
-
-log "These are the results:"
-# $FILES is a space-separated list and is split into words deliberately
-paste $FILES
+++ /dev/null
-/*
- * Experiments with various sorting algorithms
- *
- * (c) 2007--2008 Martin Mares <mj@ucw.cz>
- */
-
-#include "sherlock/sherlock.h"
-#include "ucw/getopt.h"
-#include "ucw/md5.h"
-#include "ucw/heap.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/user.h>
-
-/* The record being sorted: a 32-bit sort key followed by three words of payload. */
-struct elt {
- u32 key;
- u32 ballast[3];
-};
-
-static struct elt *ary, *alt, **ind, *array0, *array1;
-static uns n = 10000000;
-static u32 sum;
-
-/* Obtain an array of n elements from big_alloc(). */
-static struct elt *alloc_elts(uns n)
-{
-  struct elt *block = big_alloc(n * sizeof(struct elt));
-  return block;
-}
-
-/* Hand an array of n elements back to big_free(). */
-static void free_elts(struct elt *a, uns n)
-{
-  big_free(a, n * sizeof(*a));
-}
-
-/* qsort()-style comparator: orders two struct elt by ascending key. */
-static int comp(const void *x, const void *y)
-{
-  const struct elt *lhs = x;
-  const struct elt *rhs = y;
-
-  if (lhs->key < rhs->key)
-    return -1;
-  if (lhs->key > rhs->key)
-    return 1;
-  return 0;
-}
-
-/* qsort()-style comparator for arrays of element pointers: compares the keys pointed to. */
-static int comp_ind(const void *x, const void *y)
-{
-  const struct elt *lhs = *(const struct elt * const *) x;
-  const struct elt *rhs = *(const struct elt * const *) y;
-
-  return comp(lhs, rhs);
-}
-
-/*
- * Direct array sorter: including the template with these parameters provides
- * as_sort(count, array), which is called that way further down in this file.
- * NOTE(review): the template is presumably responsible for undefining the
- * ASORT_* parameters again, since they are redefined below -- confirm in
- * ucw/sorter/array-simple.h.
- */
-#define ASORT_PREFIX(x) as_##x
-#define ASORT_KEY_TYPE u32
-#define ASORT_ELT(i) a[i].key
-#define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0)
-#define ASORT_EXTRA_ARGS , struct elt *a
-#include "ucw/sorter/array-simple.h"
-
-/* Indirect sorter: provides asi_sort(count), which permutes the global pointer array ind[] by the keys pointed to. */
-#define ASORT_PREFIX(x) asi_##x
-#define ASORT_KEY_TYPE u32
-#define ASORT_ELT(i) ind[i]->key
-#define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0)
-#include "ucw/sorter/array-simple.h"
-
-/*
- * Radix sort, variant 1: least-significant-digit first with BITS-bit digits.
- * Each pass counts the digits, turns the counts into start offsets and
- * scatters the elements from one buffer into the other; the buffers swap
- * roles after every pass. ary is finally set to the buffer holding the result.
- */
-static void r1_sort(void)
-{
- struct elt *from = ary, *to = alt, *tmp;
-#define BITS 8
- uns cnt[1 << BITS];
- for (uns sh=0; sh<32; sh+=BITS)
- {
- bzero(cnt, sizeof(cnt));
- for (uns i=0; i<n; i++)
- cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
- /* Exclusive prefix sum: cnt[d] becomes the index of the first element with digit d */
- uns pos = 0;
- for (uns i=0; i<(1<<BITS); i++)
- {
- uns c = cnt[i];
- cnt[i] = pos;
- pos += c;
- }
- ASSERT(pos == n);
- for (uns i=0; i<n; i++)
- to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
- /* After scattering, the cursor of the last bucket must have reached the end */
- ASSERT(cnt[(1 << BITS)-1] == n);
- tmp=from, from=to, to=tmp;
- }
- ary = from;
-#undef BITS
-}
-
-/*
- * Radix sort, variant 1b: like r1_sort(), but the digit histogram for the
- * next pass is gathered while the current pass scatters the elements, so
- * only the very first pass needs a separate counting scan.
- * Sorts ary[0..n) by key using alt as the second buffer; ary is finally set
- * to the buffer holding the result.
- */
-static void r1b_sort(void)
-{
-  struct elt *from = ary, *to = alt, *tmp;
-#define BITS 8
-  uns cnt[1 << BITS], cnt2[1 << BITS];
-  for (uns sh=0; sh<32; sh+=BITS)
-    {
-      if (sh)
-        memcpy(cnt, cnt2, sizeof(cnt));         /* histogram prepared by the previous pass */
-      else
-        {
-          bzero(cnt, sizeof(cnt));
-          for (uns i=0; i<n; i++)
-            cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
-        }
-      /* Exclusive prefix sum: cnt[d] becomes the index of the first element with digit d */
-      uns pos = 0;
-      for (uns i=0; i<(1<<BITS); i++)
-        {
-          uns c = cnt[i];
-          cnt[i] = pos;
-          pos += c;
-        }
-      ASSERT(pos == n);
-      if (sh + BITS < 32)
-        {
-          /* Scatter and count the next digit on the fly */
-          bzero(cnt2, sizeof(cnt2));
-          for (uns i=0; i<n; i++)
-            {
-              cnt2[(from[i].key >> (sh + BITS)) & ((1 << BITS) - 1)]++;
-              to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
-            }
-        }
-      else
-        {
-          /*
-           * Last pass: there is no next digit, and shifting a 32-bit key by
-           * 32 bits would be undefined behaviour, so only scatter.
-           */
-          for (uns i=0; i<n; i++)
-            to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
-        }
-      ASSERT(cnt[(1 << BITS)-1] == n);
-      tmp=from, from=to, to=tmp;
-    }
-  ary = from;
-#undef BITS
-}
-
-/*
- * Radix sort, variant 1c: four fully unrolled LSD passes over 8-bit digits.
- * Instead of offsets it keeps one write pointer per bucket, and every pass
- * already counts the digit of the following pass. The passes alternate
- * ary -> alt -> ary -> alt -> ary, so the result ends up in ary.
- */
-static void r1c_sort(void)
-{
- uns cnt[256];
- struct elt *ptrs[256], *x, *lim;
-
- /* Histogram of the lowest byte */
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- cnt[x++->key & 255]++;
-
-/* Turn the histogram in cnt[] into bucket start pointers inside the buffer `start' */
-#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
-
- PTRS(alt);
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 8) & 255]++;
- *ptrs[x->key & 255]++ = *x;
- x++;
- }
-
- PTRS(ary);
- x = alt; lim = alt + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 16) & 255]++;
- *ptrs[(x->key >> 8) & 255]++ = *x;
- x++;
- }
-
- PTRS(alt);
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 24) & 255]++;
- *ptrs[(x->key >> 16) & 255]++ = *x;
- x++;
- }
-
- /* Final pass on the top byte; nothing left to count */
- PTRS(ary);
- x = alt; lim = alt + n;
- while (x < lim)
- {
- *ptrs[(x->key >> 24) & 255]++ = *x;
- x++;
- }
-#undef PTRS
-}
-
-#include <emmintrin.h>
-
-/* Copy one element as a single 128-bit value using an aligned SSE2 load and store; both pointers must be 16-byte aligned. */
-static inline void sse_copy_elt(struct elt *to, struct elt *from)
-{
-  _mm_store_si128((__m128i *) to, _mm_load_si128((__m128i *) from));
-}
-
-/*
- * Radix sort, variant 1c-sse: the same four unrolled 8-bit LSD passes as
- * r1c_sort(), but elements are moved with sse_copy_elt(). The asserts check
- * what that copy needs: 16-byte elements and 16-byte aligned buffers.
- * The result ends up in ary.
- */
-static void r1c_sse_sort(void)
-{
- uns cnt[256];
- struct elt *ptrs[256], *x, *lim;
-
- ASSERT(sizeof(struct elt) == 16);
- ASSERT(!((uintptr_t)alt & 15));
- ASSERT(!((uintptr_t)ary & 15));
-
- /* Histogram of the lowest byte */
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- cnt[x++->key & 255]++;
-
-/* Turn the histogram in cnt[] into bucket start pointers inside the buffer `start' */
-#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
-
- PTRS(alt);
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 8) & 255]++;
- sse_copy_elt(ptrs[x->key & 255]++, x);
- x++;
- }
-
- PTRS(ary);
- x = alt; lim = alt + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 16) & 255]++;
- sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x);
- x++;
- }
-
- PTRS(alt);
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 24) & 255]++;
- sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x);
- x++;
- }
-
- /* Final pass on the top byte; nothing left to count */
- PTRS(ary);
- x = alt; lim = alt + n;
- while (x < lim)
- {
- sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x);
- x++;
- }
-#undef PTRS
-}
-
-/*
- * Radix sort, variant 1d: r1c_sort() with manually unrolled loops. The
- * unrolling relies on n being a multiple of 4, which is asserted up front.
- * The result ends up in ary.
- */
-static void r1d_sort(void)
-{
- uns cnt[256];
- struct elt *ptrs[256], *x, *y, *lim;
-
- ASSERT(!(n % 4));
-
- /* Histogram of the lowest byte, four elements per iteration */
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[x++->key & 255]++;
- cnt[x++->key & 255]++;
- cnt[x++->key & 255]++;
- cnt[x++->key & 255]++;
- }
-
-/* Turn the histogram in cnt[] into bucket start pointers inside the buffer `start' */
-#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
-
- /*
-  * First pass: reads the two halves of the input through x and y in an
-  * interleaved fashion. The input order does not have to be preserved
-  * here because no earlier pass has established one yet.
-  */
- PTRS(alt);
- x = ary; y = ary+n/2; lim = ary + n/2;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 8) & 255]++;
- cnt[(y->key >> 8) & 255]++;
- *ptrs[x->key & 255]++ = *x;
- *ptrs[y->key & 255]++ = *y;
- x++, y++;
- cnt[(x->key >> 8) & 255]++;
- cnt[(y->key >> 8) & 255]++;
- *ptrs[x->key & 255]++ = *x;
- *ptrs[y->key & 255]++ = *y;
- x++, y++;
- }
-
- /* The remaining passes walk the input strictly in order, two elements per iteration */
- PTRS(ary);
- x = alt; lim = alt + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 16) & 255]++;
- *ptrs[(x->key >> 8) & 255]++ = *x;
- x++;
- cnt[(x->key >> 16) & 255]++;
- *ptrs[(x->key >> 8) & 255]++ = *x;
- x++;
- }
-
- PTRS(alt);
- x = ary; lim = ary + n;
- bzero(cnt, sizeof(cnt));
- while (x < lim)
- {
- cnt[(x->key >> 24) & 255]++;
- *ptrs[(x->key >> 16) & 255]++ = *x;
- x++;
- cnt[(x->key >> 24) & 255]++;
- *ptrs[(x->key >> 16) & 255]++ = *x;
- x++;
- }
-
- PTRS(ary);
- x = alt; lim = alt + n;
- while (x < lim)
- {
- *ptrs[(x->key >> 24) & 255]++ = *x;
- x++;
- *ptrs[(x->key >> 24) & 255]++ = *x;
- x++;
- }
-#undef PTRS
-}
-
-/*
- * Radix sort, variant 2: a single most-significant-digit pass on the top
- * BITS bits distributes ary into buckets in alt, then every bucket is
- * finished with as_sort(). The result stays in alt, which becomes the new ary.
- */
-static void r2_sort(void)
-{
- struct elt *from = ary, *to = alt;
-#define BITS 14
- uns cnt[1 << BITS];
- bzero(cnt, sizeof(cnt));
- for (uns i=0; i<n; i++)
- cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++;
- /* Exclusive prefix sum: cnt[d] becomes the index of the first element of bucket d */
- uns pos = 0;
- for (uns i=0; i<(1<<BITS); i++)
- {
- uns c = cnt[i];
- cnt[i] = pos;
- pos += c;
- }
- ASSERT(pos == n);
- for (uns i=0; i<n; i++)
- to[cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i];
- ASSERT(cnt[(1 << BITS)-1] == n);
-
- /* After the scatter, cnt[i] is the end of bucket i; pos tracks its start */
- pos = 0;
- for (uns i=0; i<(1 << BITS); i++)
- {
- as_sort(cnt[i] - pos, alt+pos);
- pos = cnt[i];
- }
- ary = alt;
-#undef BITS
-}
-
-/*
- * Radix sort, variant 3: recursive most-significant-digit radix sort with
- * BITS-bit digits. A bucket is finished with as_sort() once it has at most
- * THRESHOLD elements or the recursion has reached level LEVELS; otherwise it
- * is split again with the two buffers swapped. ODDEVEN selects which buffer
- * receives the final result (0: ary, otherwise alt).
- * The helper r3() is a nested function, which is a GNU C extension.
- */
-static void r3_sort(void)
-{
-#define BITS 10
-#define LEVELS 2
-#define BUCKS (1 << BITS)
-#define THRESHOLD 5000
-#define ODDEVEN 0
-
- auto void r3(struct elt *from, struct elt *to, uns n, uns lev);
- void r3(struct elt *from, struct elt *to, uns n, uns lev)
- {
- /* Level 1 looks at the top BITS bits of the key, level 2 at the next BITS bits */
- uns sh = 32 - lev*BITS;
- uns cnt[BUCKS];
- bzero(cnt, sizeof(cnt));
- for (uns i=0; i<n; i++)
- cnt[(from[i].key >> sh) & (BUCKS - 1)]++;
- uns pos = 0;
- for (uns i=0; i<BUCKS; i++)
- {
- uns c = cnt[i];
- cnt[i] = pos;
- pos += c;
- }
- ASSERT(pos == n);
- for (uns i=0; i<n; i++)
-#if 1
- to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++] = from[i];
-#else
- sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]);
-#endif
- /* After the scatter, cnt[i] is the end of bucket i; pos tracks its start */
- pos = 0;
- for (uns i=0; i<BUCKS; i++)
- {
- uns l = cnt[i]-pos;
- if (lev >= LEVELS || l <= THRESHOLD)
- {
- as_sort(l, to+pos);
- /* The bucket was sorted in `to'; copy it back when this level's `to' is not the result buffer */
- if ((lev % 2) != ODDEVEN)
- memcpy(from+pos, to+pos, l * sizeof(struct elt));
- }
- else
- r3(to+pos, from+pos, l, lev+1);
- pos = cnt[i];
- }
- }
-
- r3(ary, alt, n, 1);
- if (ODDEVEN)
- ary = alt;
-
-#undef ODDEVEN
-#undef THRESHOLD
-#undef BUCKS
-#undef LEVELS
-#undef BITS
-}
-
-/*
- * Merge the sorted runs [x, xl) and [y, yl) into the output starting at z
- * and return the position just past the last element written. Both runs
- * must be non-empty, because the first comparison reads *x and *y. On equal
- * keys the element from the first run is taken first.
- */
-static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z)
-{
- for (;;)
- {
- if (x->key <= y->key)
- {
- *z++ = *x++;
- if (x >= xl)
- goto xend;
- }
- else
- {
- *z++ = *y++;
- if (y >= yl)
- goto yend;
- }
- }
-
- /* First run exhausted: copy the rest of the second one */
- xend:
- while (y < yl)
- *z++ = *y++;
- return z;
-
- /* Second run exhausted: copy the rest of the first one */
- yend:
- while (x < xl)
- *z++ = *x++;
- return z;
-}
-
-/*
- * Bottom-up merge sort. A special first pass orders adjacent pairs while
- * copying ary to alt; every further pass merges neighbouring runs of
- * 1 << lev elements, alternating the direction between the two buffers.
- * ary is finally set to whichever buffer received the last pass.
- */
-static void mergesort(void)
-{
- struct elt *from, *to;
- uns lev = 0;
- if (1)
- {
- /* Pass 0: sort pairs; `last' is the end of the even-length prefix */
- struct elt *x = ary, *z = alt, *last = ary + (n & ~1U);
- while (x < last)
- {
- if (x[0].key < x[1].key)
- *z++ = *x++, *z++ = *x++;
- else
- {
- *z++ = x[1];
- *z++ = x[0];
- x += 2;
- }
- }
- /* Odd n: the unpaired last element is copied as is */
- if (n % 2)
- *z = *x;
- lev++;
- }
- for (; (1U << lev) < n; lev++)
- {
- /* Odd levels read alt and write ary, even levels the other way round */
- if (lev % 2)
- from = alt, to = ary;
- else
- from = ary, to = alt;
- struct elt *x, *z, *last;
- x = from;
- z = to;
- last = from + n;
- uns step = 1 << lev;
- while (x + 2*step <= last)
- {
- z = mrg(x, x+step, x+step, x+2*step, z);
- x += 2*step;
- }
- /* Tail: either one full run plus a shorter one to merge, or a single run to copy */
- if (x + step < last)
- mrg(x, x+step, x+step, last, z);
- else
- memcpy(z, x, (byte*)last - (byte*)x);
- }
- /* lev is now one past the last executed level; an odd value means the data sits in alt */
- if (lev % 2)
- ary = alt;
-}
-
-/*
- * One level of sample sort: picks WAYS random splitters from ar[0..n),
- * distributes the elements into WAYS buckets in al[] and then finishes each
- * bucket either recursively (with the two buffers swapped) or with as_sort().
- *
- *   n     number of elements; must be non-zero (used as a modulus)
- *   ar    the elements to sort
- *   al    second buffer of the same size
- *   dest  compared against al to decide whether a bucket sorted in al has
- *         to be copied back to ar
- *   wbuf  scratch space of at least n bytes, holds the bucket number of
- *         every element between the two scans
- */
-static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
-{
-#define WAYS 256
-  struct elt k[WAYS];
-  uns cnt[WAYS];
-  bzero(cnt, sizeof(cnt));
-  for (uns i=0; i<WAYS; i++)
-    k[i] = ar[random() % n];
-  as_sort(WAYS, k);
-  for (uns i=0; i<n; i++)
-    {
-      /* Binary search among the sorted splitters, one fixed step per bit of the bucket number */
-      uns w = 0;
-#define FW(delta) if (ar[i].key > k[w+delta].key) w += delta
-      FW(128);
-      FW(64);
-      FW(32);
-      FW(16);
-      FW(8);
-      FW(4);
-      FW(2);
-      FW(1);
-      wbuf[i] = w;
-      cnt[w]++;
-    }
-  /* Bucket start pointers inside al */
-  struct elt *y = al, *way[WAYS], *z;
-  for (uns i=0; i<WAYS; i++)
-    {
-      way[i] = y;
-      y += cnt[i];
-    }
-  ASSERT(y == al+n);
-  for (uns i=0; i<n; i++)
-    {
-      uns w = wbuf[i];
-      *way[w]++ = ar[i];
-    }
-  y = al;
-  z = ar;
-  for (uns i=0; i<WAYS; i++)
-    {
-      /*
-       * Recurse only when the split made progress. If all n elements landed
-       * in this one bucket (e.g. all keys are equal), recursing would never
-       * shrink the problem, so such a bucket goes to as_sort() instead.
-       */
-      if (cnt[i] >= 1000 && cnt[i] < n)
-        sampsort(cnt[i], y, z, dest, wbuf);
-      else
-        {
-          as_sort(cnt[i], y);
-          if (al != dest)
-            memcpy(z, y, cnt[i]*sizeof(struct elt));
-        }
-      y += cnt[i];
-      z += cnt[i];
-    }
-#undef FW
-#undef WAYS
-}
-
-/* Benchmark entry point for sampsort(): sorts ary with alt as the second buffer and a temporary n-byte bucket map. */
-static void samplesort(void)
-{
-  byte *bucket_map = xmalloc(n);
-
-  sampsort(n, ary, alt, ary, bucket_map);
-  xfree(bucket_map);
-}
-
-/*
- * One level of sample sort, variant 2: identical to sampsort() except that
- * the splitter search handles two elements per iteration (k1 and k2 walk
- * the even and odd positions), with a single-element step for an odd tail.
- *
- *   n     number of elements; must be non-zero (used as a modulus)
- *   ar    the elements to sort
- *   al    second buffer of the same size
- *   dest  compared against al to decide whether a bucket sorted in al has
- *         to be copied back to ar
- *   wbuf  scratch space of at least n bytes, holds the bucket number of
- *         every element between the two scans
- */
-static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
-{
-#define WAYS 256
-  struct elt k[WAYS];
-  uns cnt[WAYS];
-  bzero(cnt, sizeof(cnt));
-  for (uns i=0; i<WAYS; i++)
-    k[i] = ar[random() % n];
-  as_sort(WAYS, k);
-  struct elt *k1 = ar, *k2 = ar+1, *kend = ar+n;
-  byte *ww = wbuf;
-  while (k2 < kend)
-    {
-      /* Two interleaved binary searches among the sorted splitters */
-      uns w1 = 0, w2 = 0;
-#define FW1(delta) if (k1->key > k[w1+delta].key) w1 += delta
-#define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta
-      FW1(128); FW2(128);
-      FW1(64); FW2(64);
-      FW1(32); FW2(32);
-      FW1(16); FW2(16);
-      FW1(8); FW2(8);
-      FW1(4); FW2(4);
-      FW1(2); FW2(2);
-      FW1(1); FW2(1);
-      *ww++ = w1;
-      *ww++ = w2;
-      cnt[w1]++;
-      cnt[w2]++;
-      k1 += 2;
-      k2 += 2;
-    }
-  /* Odd n: one element is left over */
-  if (k1 < kend)
-    {
-      uns w1 = 0;
-      FW1(128); FW1(64); FW1(32); FW1(16);
-      FW1(8); FW1(4); FW1(2); FW1(1);
-      *ww++ = w1;
-      cnt[w1]++;
-    }
-  /* Bucket start pointers inside al */
-  struct elt *y = al, *way[WAYS], *z;
-  for (uns i=0; i<WAYS; i++)
-    {
-      way[i] = y;
-      y += cnt[i];
-    }
-  ASSERT(y == al+n);
-  for (uns i=0; i<n; i++)
-    {
-      uns w = wbuf[i];
-      *way[w]++ = ar[i];
-    }
-  y = al;
-  z = ar;
-  for (uns i=0; i<WAYS; i++)
-    {
-      /*
-       * Recurse only when the split made progress. If all n elements landed
-       * in this one bucket (e.g. all keys are equal), recursing would never
-       * shrink the problem, so such a bucket goes to as_sort() instead.
-       */
-      if (cnt[i] >= 1000 && cnt[i] < n)
-        sampsort2(cnt[i], y, z, dest, wbuf);
-      else
-        {
-          as_sort(cnt[i], y);
-          if (al != dest)
-            memcpy(z, y, cnt[i]*sizeof(struct elt));
-        }
-      y += cnt[i];
-      z += cnt[i];
-    }
-#undef FW1
-#undef FW2
-#undef WAYS
-}
-
-/* Benchmark entry point for sampsort2(): sorts ary with alt as the second buffer and a temporary n-byte bucket map. */
-static void samplesort2(void)
-{
-  byte *bucket_map = xmalloc(n);
-
-  sampsort2(n, ary, alt, ary, bucket_map);
-  xfree(bucket_map);
-}
-
-/*
- * Heap sort of ary[0..n) built on the HEAP_* macros from ucw/heap.h.
- * The heap pointer is ary-1, so heap[1] is ary[0].
- * NOTE(review): H_LESS is deliberately reversed (it tests ">"); presumably
- * the macros then keep the largest key on top and HEAP_DELMIN moves it to
- * the end of the shrinking heap, giving ascending order -- confirm against
- * ucw/heap.h.
- */
-static void heapsort(void)
-{
-#define H_LESS(_a,_b) ((_a).key > (_b).key)
- struct elt *heap = ary-1;
- HEAP_INIT(struct elt, heap, n, H_LESS, HEAP_SWAP);
- uns nn = n;
- while (nn)
- HEAP_DELMIN(struct elt, heap, nn, H_LESS, HEAP_SWAP);
-#undef H_LESS
-}
-
-/*
- * Heap sort of the pointer array ind[0..n) by the keys pointed to, built on
- * the HEAP_* macros from ucw/heap.h. The heap pointer is ind-1, so heap[1]
- * is ind[0].
- * NOTE(review): H_LESS is deliberately reversed (it tests ">"); presumably
- * this yields ascending order once every element has been removed -- confirm
- * against ucw/heap.h.
- */
-static void heapsort_ind(void)
-{
-#define H_LESS(_a,_b) ((_a)->key > (_b)->key)
- struct elt **heap = ind-1;
- HEAP_INIT(struct elt *, heap, n, H_LESS, HEAP_SWAP);
- uns nn = n;
- while (nn)
- HEAP_DELMIN(struct elt *, heap, nn, H_LESS, HEAP_SWAP);
-#undef H_LESS
-}
-
-/*
- * Prepare a fresh test input: point ary/alt at the two preallocated arrays
- * and fill ary[0..n) with keys taken from the MD5 state, which is advanced
- * by one md5_transform() for every four elements. The payload words are
- * rotations of the key, and `sum' records the XOR of all keys so that the
- * chk_* functions can detect lost or damaged elements.
- */
-static void mk_ary(void)
-{
- ary = array0;
- alt = array1;
- md5_context ctx;
- md5_init(&ctx);
- u32 block[16];
- bzero(block, sizeof(block));
-
- sum = 0;
- for (uns i=0; i<n; i++)
- {
-#if 1
- /* Every fourth element: mix the index into the block and produce four new state words */
- if (!(i % 4))
- {
- block[i%16] = i;
- md5_transform(ctx.buf, block);
- }
- ary[i].key = ctx.buf[i%4];
-#else
- /* Alternative input: keys already in ascending order */
- ary[i].key = i*(~0U/(n-1));
-#endif
- /* Fill the words following the key with rotated copies of it */
- for (uns j=1; j<sizeof(struct elt)/4; j++)
- ((u32*)&ary[i])[j] = ROL(ary[i].key, 3*j);
- sum ^= ary[i].key;
- }
-}
-
-/*
- * Verify the result of a direct sort: ary[0..n) must be in non-decreasing
- * key order and the XOR of all keys must still equal `sum'. Calls die() on
- * the first violation. Reads ary[0] unconditionally, so n must be non-zero.
- */
-static void chk_ary(void)
-{
-  u32 s = ary[0].key;
-  for (uns i=1; i<n; i++)
-    if (ary[i].key < ary[i-1].key)
-      die("Missorted at %u", i);        /* i is unsigned, hence %u */
-    else
-      s ^= ary[i].key;
-  if (s != sum)
-    die("Corrupted");
-}
-
-/* Prepare input for the indirect sorts: regenerate ary and allocate ind[] so that ind[i] points at ary[i]. */
-static void mk_ind(void)
-{
-  mk_ary();
-  ind = xmalloc(sizeof(struct elt *) * n);
-
-  struct elt *e = ary;
-  for (uns i=0; i<n; i++, e++)
-    ind[i] = e;
-}
-
-/*
- * Verify the result of an indirect sort: the keys reached through
- * ind[0..n) must be in non-decreasing order and their XOR must still equal
- * `sum'. Calls die() on the first violation; on success the pointer array
- * allocated by mk_ind() is released. Reads ind[0] unconditionally, so n
- * must be non-zero.
- */
-static void chk_ind(void)
-{
-  u32 s = ind[0]->key;
-  for (uns i=1; i<n; i++)
-    if (ind[i]->key < ind[i-1]->key)
-      die("Missorted at %u", i);        /* i is unsigned, hence %u */
-    else
-      s ^= ind[i]->key;
-  if (s != sum)
-    die("Corrupted");
-  xfree(ind);
-}
-
-/*
- * Benchmark driver: allocates the two element arrays, measures a plain
- * memcpy baseline and then runs every sorting variant on freshly generated
- * input, logging the time reported by get_timer() and verifying each result.
- */
-int main(int argc, char **argv)
-{
- log_init(argv[0]);
-
- int opt;
- uns op = 0;
- while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0)
- switch (opt)
- {
- case '1':
- /* NOTE(review): op is set here but never read anywhere in this function */
- op |= (1 << (opt - '0'));
- break;
- default:
- die("usage?");
- }
-
- array0 = alloc_elts(n);
- array1 = alloc_elts(n);
- /* Write to every element of both arrays before any timing starts */
- for (uns i=0; i<n; i++)
- array0[i] = array1[i] = (struct elt) { 0 };
-
- log(L_INFO, "Testing with %u elements", n);
-
- mk_ary();
- timestamp_t timer;
- init_timer(&timer);
- /* Baseline: 5 rounds of two full copies each; dividing by 10 gives the time of one copy */
- for (uns i=0; i<5; i++)
- {
-#if 1
- memcpy(alt, ary, sizeof(struct elt) * n);
- memcpy(ary, alt, sizeof(struct elt) * n);
-#else
- for (uns j=0; j<n; j++)
- alt[j] = ary[j];
- for (uns j=0; j<n; j++)
- ary[j] = alt[j];
-#endif
- }
- log(L_DEBUG, "memcpy: %d", get_timer(&timer)/10);
-
-/* Generate input with mk_<type>(), time `func', log the result under `name' and verify with chk_<type>() */
-#define BENCH(type, name, func) mk_##type(); init_timer(&timer); func; log(L_DEBUG, name ": %d", get_timer(&timer)); chk_##type()
-
- BENCH(ary, "qsort", qsort(ary, n, sizeof(struct elt), comp));
- BENCH(ary, "arraysort", as_sort(n, ary));
- BENCH(ind, "indirect qsort", qsort(ind, n, sizeof(struct elt *), comp_ind));
- BENCH(ind, "indirect arraysort", asi_sort(n));
- BENCH(ary, "radix1", r1_sort());
- BENCH(ary, "radix1b", r1b_sort());
- BENCH(ary, "radix1c", r1c_sort());
- BENCH(ary, "radix1c-sse", r1c_sse_sort());
- BENCH(ary, "radix1d", r1d_sort());
- BENCH(ary, "radix2", r2_sort());
- BENCH(ary, "radix3", r3_sort());
- BENCH(ary, "mergesort", mergesort());
- BENCH(ary, "samplesort", samplesort());
- BENCH(ary, "samplesort2", samplesort2());
- BENCH(ary, "heapsort", heapsort());
- BENCH(ind, "indirect heapsort", heapsort_ind());
-
- free_elts(array0, n);
- free_elts(array1, n);
- return 0;
-}
--- /dev/null
+How retros runs on different hardware:
+
+# 32-bit Athlon 64, gcc-4.1
+mj@albireo:~/src/sh/dev-sorter/run$ bin/retros
+D 2006-11-23 23:17:36 [retros] memcpy: 212
+D 2006-11-23 23:17:44 [retros] qsort: 6947
+D 2006-11-23 23:17:48 [retros] arraysort: 3183
+D 2006-11-23 23:18:02 [retros] indirect qsort: 13116
+D 2006-11-23 23:18:24 [retros] indirect arraysort: 19176
+D 2006-11-23 23:18:30 [retros] radix1: 3755
+D 2006-11-23 23:18:34 [retros] radix1b: 3100
+D 2006-11-23 23:18:39 [retros] radix1c: 2777
+D 2006-11-23 23:18:43 [retros] radix1c-sse: 2602
+D 2006-11-23 23:18:47 [retros] radix1d: 2728
+D 2006-11-23 23:18:53 [retros] radix2: 4249
+D 2006-11-23 23:18:57 [retros] radix3: 2577
+D 2006-11-23 23:19:09 [retros] mergesort: 10399
+D 2006-11-23 23:19:16 [retros] samplesort: 5698
+D 2006-11-23 23:19:23 [retros] samplesort2: 5016
+
+# 32-bit P4 Xeon, gcc-3.4
+sherlock@sherlock3:~/sherlock-mj/run$ bin/retros
+D 2006-11-23 23:23:52 [retros] memcpy: 198
+D 2006-11-23 23:24:23 [retros] qsort: 30114
+D 2006-11-23 23:24:27 [retros] arraysort: 2882
+D 2006-11-23 23:24:43 [retros] indirect qsort: 15019
+D 2006-11-23 23:24:59 [retros] indirect arraysort: 13267
+D 2006-11-23 23:25:03 [retros] radix1: 1881
+D 2006-11-23 23:25:06 [retros] radix1b: 1442
+D 2006-11-23 23:25:08 [retros] radix1c: 1313
+D 2006-11-23 23:25:10 [retros] radix1c-sse: 1229
+D 2006-11-23 23:25:13 [retros] radix1d: 1324
+D 2006-11-23 23:25:17 [retros] radix2: 2598
+D 2006-11-23 23:25:19 [retros] radix3: 1419
+D 2006-11-23 23:25:25 [retros] mergesort: 4929
+D 2006-11-23 23:25:29 [retros] samplesort: 2742
+D 2006-11-23 23:25:33 [retros] samplesort2: 2350
+
+# 64-bit P4 Xeon, gcc-3.4
+sherlock@sherlock4:~/sherlock-3.10/run$ bin/retros
+D 2006-11-23 23:44:31 [retros] memcpy: 132
+D 2006-11-23 23:44:58 [retros] qsort: 26469
+D 2006-11-23 23:45:01 [retros] arraysort: 2307
+D 2006-11-23 23:45:12 [retros] indirect qsort: 10971
+D 2006-11-23 23:45:24 [retros] indirect arraysort: 10350
+D 2006-11-23 23:45:26 [retros] radix1: 1099
+D 2006-11-23 23:45:27 [retros] radix1b: 1052
+D 2006-11-23 23:45:29 [retros] radix1c: 1017
+D 2006-11-23 23:45:30 [retros] radix1c-sse: 1017
+D 2006-11-23 23:45:32 [retros] radix1d: 1016
+D 2006-11-23 23:45:34 [retros] radix2: 1661
+D 2006-11-23 23:45:36 [retros] radix3: 955
+D 2006-11-23 23:45:39 [retros] mergesort: 3302
+D 2006-11-23 23:45:42 [retros] samplesort: 2376
+D 2006-11-23 23:45:45 [retros] samplesort2: 1870
+
+# 64-bit Turion X2 TL52, gcc-4.1.1
+pchar@paja ~/prog/sherlock-dev-sorter/run $ bin/retros
+D 2006-11-24 00:32:38 [retros] memcpy: 93
+D 2006-11-24 00:32:46 [retros] qsort: 7530
+D 2006-11-24 00:32:50 [retros] arraysort: 2766
+D 2006-11-24 00:33:01 [retros] indirect qsort: 10543
+D 2006-11-24 00:33:13 [retros] indirect arraysort: 10169
+D 2006-11-24 00:33:16 [retros] radix1: 1319
+D 2006-11-24 00:33:18 [retros] radix1b: 1126
+D 2006-11-24 00:33:20 [retros] radix1c: 1084
+D 2006-11-24 00:33:22 [retros] radix1c-sse: 1126
+D 2006-11-24 00:33:24 [retros] radix1d: 1091
+D 2006-11-24 00:33:27 [retros] radix2: 2238
+D 2006-11-24 00:33:29 [retros] radix3: 1183
+D 2006-11-24 00:33:34 [retros] mergesort: 4036
+D 2006-11-24 00:33:37 [retros] samplesort: 2594
+D 2006-11-24 00:33:40 [retros] samplesort2: 2214
--- /dev/null
+# Tests and experiments related to the new sorter.
+# Only the two tuning scripts are added to PROGS; the C experiments below merely get link rules.
+DIRS+=debug/sorter
+PROGS+=$(addprefix $(o)/debug/sorter/,radix-tune-bits radix-tune-thresh)
+# Dependencies of the individual programs (C experiments link against $(LIBSH); the scripts come from $(s)).
+$(o)/debug/sorter/retros: $(o)/debug/sorter/retros.o $(LIBSH)
+$(o)/debug/sorter/radix-file-test: $(o)/debug/sorter/radix-file-test.o $(LIBSH)
+$(o)/debug/sorter/radix-asio-test: $(o)/debug/sorter/radix-asio-test.o $(LIBSH)
+$(o)/debug/sorter/radix-tune-bits: $(s)/debug/sorter/radix-tune-bits.sh
+$(o)/debug/sorter/radix-tune-thresh: $(s)/debug/sorter/radix-tune-thresh.sh
--- /dev/null
+/*
+ * An experiment with parallel reading and writing of files using ASIO.
+ *
+ * (c) 2007 Martin Mares <mj@ucw.cz>
+ */
+
+#include "ucw/lib.h"
+#include "ucw/conf.h"
+#include "ucw/lfs.h"
+#include "ucw/asio.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#define COPY /* When defined, main() first creates tmp/ft-in and the parallel-write phase copies its data from there */
+#define DIRECT O_DIRECT /* Extra flag ORed into every ucw_open() call in this file */
+/* Timer shared by all phases; the P_* macros read it through get_timer(). */
+static timestamp_t timer;
+/*
+ * Progress reporting. The macros operate on locals of the function they are
+ * expanded in: cnt (bytes processed so far), cnt_rep (byte count at which the
+ * next progress line is due), cnt_ms (sum of get_timer() readings, treated as
+ * milliseconds; it starts at 1, which keeps the rate division away from zero)
+ * and total_size.
+ *
+ * P_INIT	resets the counters at the start of a phase
+ * P_UPDATE(cc)	accounts cc more bytes and reprints the progress line each time
+ *		another 64 MB (1<<26 bytes) boundary has been crossed
+ * P_FINAL	logs the total time and throughput of the phase
+ */
+#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0)
+#define P_UPDATE(cc) do { \
+ cnt += cc; \
+ if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \
+ printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \
+ fflush(stdout); cnt_rep += 1<<26; } } while(0)
+#define P_FINAL do { \
+ cnt_ms += get_timer(&timer); \
+ msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \
+} while(0)
+/* The single ASIO queue through which main() issues all of its requests. */
+static struct asio_queue io_queue;
+
+/*
+ * Usage: asio-test <nr-files> <bufsize> <totalsize>
+ *
+ * Phases:
+ *   1. (COPY only) create tmp/ft-in holding <totalsize> bytes,
+ *   2. write the data round-robin to <nr-files> files tmp/ft-<i> through
+ *      the ASIO queue using write-back requests,
+ *   3. read the output files back one after another,
+ *   4. remove all files created.
+ * The throughput of each phase is reported by the P_* macros.
+ *
+ * Returns 1 when the arguments are malformed and 0 otherwise; I/O failures
+ * trip ASSERT() or die().
+ */
+int main(int argc, char **argv)
+{
+  uns files, bufsize;
+  u64 total_size;
+  /*
+   * <nr-files> and <bufsize> are used below both as divisors and as array
+   * dimensions, so zero or negative values are rejected together with
+   * unparsable ones instead of crashing later.
+   */
+  if (argc != 4 ||
+      cf_parse_int(argv[1], (int*) &files) ||
+      cf_parse_int(argv[2], (int*) &bufsize) ||
+      cf_parse_u64(argv[3], &total_size) ||
+      (int) files <= 0 || (int) bufsize <= 0)
+    {
+      fprintf(stderr, "Usage: asio-test <nr-files> <bufsize> <totalsize>\n");
+      return 1;
+    }
+  u64 cnt, cnt_rep;
+  uns cnt_ms;
+  int fd[files];
+  byte name[files][16];
+  struct asio_request *req[files];
+
+  init_timer(&timer);
+
+  io_queue.buffer_size = bufsize;
+  io_queue.max_writebacks = 2;
+  asio_init_queue(&io_queue);
+
+#ifdef COPY
+  msg(L_INFO, "Creating input file");
+  int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
+  ASSERT(in_fd >= 0);
+  ASSERT(!(total_size % bufsize));
+  P_INIT;
+  for (uns i=0; i<total_size/bufsize; i++)
+    {
+      struct asio_request *r = asio_get(&io_queue);
+      r->op = ASIO_WRITE_BACK;
+      r->fd = in_fd;
+      r->len = bufsize;
+      /* Fill the block with a simple position-dependent pattern */
+      byte *xbuf = r->buffer;
+      for (uns j=0; j<bufsize; j++)
+        xbuf[j] = i+j;
+      asio_submit(r);
+      P_UPDATE(bufsize);
+    }
+  asio_sync(&io_queue);
+  lseek(in_fd, 0, SEEK_SET);
+  sync();
+  P_FINAL;
+#endif
+
+  msg(L_INFO, "Initializing output files");
+  for (uns i=0; i<files; i++)
+    {
+      /* Bounded formatting: a long file number is truncated rather than overflowing name[i] */
+      snprintf(name[i], sizeof(name[i]), "tmp/ft-%d", i);
+      fd[i] = ucw_open(name[i], O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
+      if (fd[i] < 0)
+        die("Cannot create %s: %m", name[i]);
+    }
+  sync();
+  get_timer(&timer);
+
+  msg(L_INFO, "Writing %d MB to %d files in parallel with %d byte buffers", (int)(total_size >> 20), files, bufsize);
+  P_INIT;
+  for (uns i=0; i<files; i++)
+    req[i] = asio_get(&io_queue);
+  for (uns round=0; round<total_size/bufsize/files; round++)
+    {
+      for (uns i=0; i<files; i++)
+        {
+          struct asio_request *r = req[i];
+#ifdef COPY
+          /* Read the next input block and wait for it before queueing the write */
+          struct asio_request *rr, *rd = asio_get(&io_queue);
+          rd->op = ASIO_READ;
+          rd->fd = in_fd;
+          rd->len = bufsize;
+          asio_submit(rd);
+          rr = asio_wait(&io_queue);
+          ASSERT(rr == rd && rd->status == (int)rd->len);
+          memcpy(r->buffer, rd->buffer, bufsize);
+          asio_put(rr);
+#else
+          for (uns j=0; j<bufsize; j++)
+            r->buffer[j] = round+i+j;
+#endif
+          r->op = ASIO_WRITE_BACK;
+          r->fd = fd[i];
+          r->len = bufsize;
+          asio_submit(r);
+          P_UPDATE(bufsize);
+          /* The submitted request now belongs to the queue; take a fresh one for this file */
+          req[i] = asio_get(&io_queue);
+        }
+    }
+  for (uns i=0; i<files; i++)
+    asio_put(req[i]);
+  asio_sync(&io_queue);
+#ifdef COPY
+  close(in_fd);
+#endif
+  msg(L_INFO, "Syncing");
+  sync();
+  P_FINAL;
+
+  msg(L_INFO, "Reading the files sequentially");
+  P_INIT;
+  for (uns i=0; i<files; i++)
+    {
+      lseek(fd[i], 0, SEEK_SET);
+      for (uns round=0; round<total_size/bufsize/files; round++)
+        {
+          struct asio_request *rr, *r = asio_get(&io_queue);
+          r->op = ASIO_READ;
+          r->fd = fd[i];
+          r->len = bufsize;
+          asio_submit(r);
+          rr = asio_wait(&io_queue);
+          ASSERT(rr == r && r->status == (int)bufsize);
+          asio_put(r);
+          P_UPDATE(bufsize);
+        }
+      close(fd[i]);
+    }
+  P_FINAL;
+
+  for (uns i=0; i<files; i++)
+    unlink(name[i]);
+#ifdef COPY
+  unlink("tmp/ft-in");
+#endif
+
+  asio_cleanup_queue(&io_queue);
+  msg(L_INFO, "Done");
+  return 0;
+}
--- /dev/null
+/*
+ * An experiment with parallel reading and writing of files.
+ *
+ * (c) 2007 Martin Mares <mj@ucw.cz>
+ */
+
+#include "ucw/lib.h"
+#include "ucw/conf.h"
+#include "ucw/lfs.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#define COPY /* When defined, main() first creates tmp/ft-in and the parallel-write phase copies its data from there */
+#define DIRECT 0 // Extra flag ORed into every ucw_open() call; 0 or O_DIRECT
+/* Timer shared by all phases; the P_* macros read it through get_timer(). */
+static timestamp_t timer;
+/*
+ * Progress reporting. The macros operate on locals of the function they are
+ * expanded in: cnt (bytes processed so far), cnt_rep (byte count at which the
+ * next progress line is due), cnt_ms (sum of get_timer() readings, treated as
+ * milliseconds; it starts at 1, which keeps the rate division away from zero)
+ * and total_size.
+ *
+ * P_INIT	resets the counters at the start of a phase
+ * P_UPDATE(cc)	accounts cc more bytes and reprints the progress line each time
+ *		another 64 MB (1<<26 bytes) boundary has been crossed
+ * P_FINAL	logs the total time and throughput of the phase
+ */
+#define P_INIT do { cnt = 0; cnt_rep = 0; cnt_ms = 1; } while(0)
+#define P_UPDATE(cc) do { \
+ cnt += cc; \
+ if (cnt >= cnt_rep) { cnt_ms += get_timer(&timer); \
+ printf("%d of %d MB (%.2f MB/sec)\r", (int)(cnt >> 20), (int)(total_size >> 20), (double)cnt / 1048576 * 1000 / cnt_ms); \
+ fflush(stdout); cnt_rep += 1<<26; } } while(0)
+#define P_FINAL do { \
+ cnt_ms += get_timer(&timer); \
+ msg(L_INFO, "Spent %.3f sec (%.2f MB/sec)", (double)cnt_ms/1000, (double)cnt / 1048576 * 1000 / cnt_ms); \
+} while(0)
+
+/*
+ * Usage: file-test <nr-files> <bufsize> <totalsize>
+ *
+ * Phases:
+ *   1. (COPY only) create tmp/ft-in holding <totalsize> bytes,
+ *   2. write the data round-robin to <nr-files> files tmp/ft-<i> using
+ *      plain read()/write() with one buffer per file,
+ *   3. read the output files back one after another,
+ *   4. remove all files created.
+ * The throughput of each phase is reported by the P_* macros.
+ *
+ * Returns 1 when the arguments are malformed and 0 otherwise; I/O failures
+ * (including short reads and writes) trip ASSERT() or die().
+ */
+int main(int argc, char **argv)
+{
+  uns files, bufsize;
+  u64 total_size;
+  /*
+   * <nr-files> and <bufsize> are used below both as divisors and as array
+   * dimensions, so zero or negative values are rejected together with
+   * unparsable ones instead of crashing later.
+   */
+  if (argc != 4 ||
+      cf_parse_int(argv[1], (int*) &files) ||
+      cf_parse_int(argv[2], (int*) &bufsize) ||
+      cf_parse_u64(argv[3], &total_size) ||
+      (int) files <= 0 || (int) bufsize <= 0)
+    {
+      fprintf(stderr, "Usage: file-test <nr-files> <bufsize> <totalsize>\n");
+      return 1;
+    }
+  u64 cnt, cnt_rep;
+  uns cnt_ms;
+  int fd[files];
+  byte *buf[files], name[files][16];
+  uns xbufsize = bufsize;		// Used for single-file I/O
+  byte *xbuf = big_alloc(xbufsize);
+
+  init_timer(&timer);
+
+#ifdef COPY
+  msg(L_INFO, "Creating input file");
+  int in_fd = ucw_open("tmp/ft-in", O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
+  ASSERT(in_fd >= 0);
+  ASSERT(!(total_size % xbufsize));
+  P_INIT;
+  for (uns i=0; i<total_size/xbufsize; i++)
+    {
+      /* Fill the block with a simple position-dependent pattern */
+      for (uns j=0; j<xbufsize; j++)
+        xbuf[j] = i+j;
+      uns c = write(in_fd, xbuf, xbufsize);
+      ASSERT(c == xbufsize);
+      P_UPDATE(c);
+    }
+  lseek(in_fd, 0, SEEK_SET);
+  sync();
+  P_FINAL;
+#endif
+
+  msg(L_INFO, "Initializing output files");
+  for (uns i=0; i<files; i++)
+    {
+      /* Bounded formatting: a long file number is truncated rather than overflowing name[i] */
+      snprintf(name[i], sizeof(name[i]), "tmp/ft-%d", i);
+      fd[i] = ucw_open(name[i], O_RDWR | O_CREAT | O_TRUNC | DIRECT, 0666);
+      if (fd[i] < 0)
+        die("Cannot create %s: %m", name[i]);
+      buf[i] = big_alloc(bufsize);
+    }
+  sync();
+  get_timer(&timer);
+
+  msg(L_INFO, "Writing %d MB to %d files in parallel with %d byte buffers", (int)(total_size >> 20), files, bufsize);
+  P_INIT;
+  for (uns r=0; r<total_size/bufsize/files; r++)
+    {
+      for (uns i=0; i<files; i++)
+        {
+#ifdef COPY
+          uns ci = read(in_fd, buf[i], bufsize);
+          ASSERT(ci == bufsize);
+#else
+          for (uns j=0; j<bufsize; j++)
+            buf[i][j] = r+i+j;
+#endif
+          uns c = write(fd[i], buf[i], bufsize);
+          ASSERT(c == bufsize);
+          P_UPDATE(c);
+        }
+    }
+#ifdef COPY
+  close(in_fd);
+#endif
+  msg(L_INFO, "Syncing");
+  sync();
+  P_FINAL;
+
+  msg(L_INFO, "Reading the files sequentially");
+  P_INIT;
+  for (uns i=0; i<files; i++)
+    {
+      lseek(fd[i], 0, SEEK_SET);
+      for (uns r=0; r<total_size/xbufsize/files; r++)
+        {
+          uns c = read(fd[i], xbuf, xbufsize);
+          ASSERT(c == xbufsize);
+          P_UPDATE(c);
+        }
+      close(fd[i]);
+    }
+  P_FINAL;
+
+  for (uns i=0; i<files; i++)
+    unlink(name[i]);
+#ifdef COPY
+  unlink("tmp/ft-in");
+#endif
+  msg(L_INFO, "Done");
+  return 0;
+}
--- /dev/null
+#!/bin/bash
+# An utility for tuning the Sherlock's radix sorter
+# (c) 2007 Martin Mares <mj@ucw.cz>
+set -e
+UCW_PROGNAME="$0"
+. lib/libucw.sh
+
+# Path to Sherlock build directory
+[ -n "$BUILD" ] || BUILD=..
+[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory"
+
+# Find out sort buffer size
+parse-config 'Sorter{##SortBuffer}'
+SORTBUF=$CF_Sorter_SortBuffer
+[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer"
+log "Detected sort buffer size $SORTBUF"
+
+# Size of the test -- should be slightly less than a half of SortBuffer
+SIZE=$(($SORTBUF/2 - 8192))
+log "Decided to benchmark sorting of $SIZE byte data"
+
+# Which bit widths we try
+WIDTHS="0 6 7 8 9 10 11 12 13 14"
+
+# Which RadixThresholds we try
+THRS="2000 4000 10000 20000 50000"
+
+# Which sort-test tests we try
+TESTS="2,5,8,15"
+
+# Check various bit widths of the radix sorter
+rm -f tmp/radix-*
+for W in $WIDTHS ; do
+ rm -f $BUILD/obj/ucw/sorter/sort-test{,.o}
+ if [ $W = 0 ] ; then
+ log "Compiling with no radix splits"
+ ( cd $BUILD && make obj/ucw/sorter/sort-test )
+ OPT="-d32"
+ else
+ log "Compiling with $W-bit radix splits"
+ ( cd $BUILD && make CEXTRA="-DFORCE_RADIX_BITS=$W" obj/ucw/sorter/sort-test )
+ OPT=
+ fi
+ for THR in $THRS ; do
+ log "Testing with RadixThreshold=$THR"
+ $BUILD/obj/ucw/sorter/sort-test -SThreads.DefaultStackSize=2M -SSorter.RadixThreshold=$THR -s$SIZE -t$TESTS $OPT -v 2>&1 | tee -a tmp/radix-$W
+ done
+done
+
+echo "thresh" >tmp/radix-thrs
+echo "test#" >tmp/radix-tests
+for THR in $THRS ; do
+ for TEST in `echo $TESTS | tr ',' ' '` ; do
+ echo $THR >>tmp/radix-thrs
+ echo $TEST >>tmp/radix-tests
+ done
+done
+
+FILES="tmp/radix-thrs tmp/radix-tests"
+for W in $WIDTHS ; do
+ a=tmp/radix-$W
+ echo >$a.out "$W bits"
+ sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <$a >>$a.out
+ FILES="$FILES $a.out"
+done
+
+log "These are the results:"
+paste $FILES
--- /dev/null
+#!/bin/bash
+# An utility for tuning the Sherlock's radix sorter threshold
+# (c) 2007 Martin Mares <mj@ucw.cz>
+set -e
+UCW_PROGNAME="$0"
+. lib/libucw.sh
+
+# Path to Sherlock build directory
+[ -n "$BUILD" ] || BUILD=..
+[ -f "$BUILD/ucw/sorter/sorter.h" ] || die "BUILD does not point to Sherlock build directory"
+
+# Find out sort buffer size
+parse-config 'Sorter{##SortBuffer}'
+SORTBUF=$CF_Sorter_SortBuffer
+[ "$SORTBUF" -gt 0 ] || die "Unable to determine SortBuffer"
+log "Detected sort buffer size $SORTBUF"
+
+# Find out radix-sorter width
+[ -f "$BUILD/obj/config.mk" ] || die "Sherlock source not configured"
+WIDTH=`sed <$BUILD/obj/config.mk 's/^CONFIG_UCW_RADIX_SORTER_BITS=\(.*\)/\1/;t;d'`
+[ -n "$WIDTH" ] || die "CONFIG_UCW_RADIX_SORTER_BITS not set (!?)"
+log "Detected radix-sorter width $WIDTH"
+
+# Maximum size of the test -- should be slightly less than a half of SortBuffer
+SIZE=$(($SORTBUF/2 - 8192))
+
+# Which sort-test test we try
+TEST="2"
+
+# Which thresholds we try
+THRS="16"
+T=$SIZE
+while [ $T -gt 100 ] ; do
+ THRS="$THRS $T"
+ T=$(($T/2))
+done
+
+if true ; then
+
+rm -f tmp/radix-*
+echo "sizes" >tmp/radix-sizes
+while [ $SIZE -gt 262144 ] ; do
+ echo $SIZE >>tmp/radix-sizes
+ for T in $THRS ; do
+ log "Trying size $SIZE with threshold $T"
+ $BUILD/obj/ucw/sorter/sort-test -SSorter.RadixThreshold=$T -s$SIZE -t$TEST -v 2>&1 | tee -a tmp/radix-$T
+ done
+ SIZE=$(($SIZE/2))
+done
+
+fi
+
+FILES=tmp/radix-sizes
+for T in $THRS ; do
+ a=tmp/radix-$T
+ echo >$a.out $T
+ sed 's/.* \([0-9.]\+\)s internal sorting.*/\1/;t;d' <$a >>$a.out
+ FILES="$FILES $a.out"
+done
+
+log "These are the results:"
+paste $FILES
--- /dev/null
+/*
+ * Experiments with various sorting algorithms
+ *
+ * (c) 2007--2008 Martin Mares <mj@ucw.cz>
+ */
+
+#include "sherlock/sherlock.h"
+#include "ucw/getopt.h"
+#include "ucw/md5.h"
+#include "ucw/heap.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+
+struct elt { /* One record of the benchmark data */
+ u32 key; /* the only field the sorts compare */
+ u32 ballast[3]; /* payload; mk_ary() fills it with rotations of the key */
+};
+
+static struct elt *ary, *alt, **ind, *array0, *array1; /* ary: data to sort (sorts may repoint it at the result); alt: scratch buffer; ind: pointer array for the indirect sorts; array0/array1: the two allocations behind ary/alt */
+static uns n = 10000000; /* number of elements sorted in every benchmark */
+static u32 sum; /* XOR of all keys as generated by mk_ary(); verified by chk_ary() and chk_ind() */
+
+/* Obtain storage for n elements from big_alloc(); give it back with free_elts() and the same n. */
+static struct elt *alloc_elts(uns n)
+{
+  size_t bytes = n * sizeof(struct elt);
+  return big_alloc(bytes);
+}
+
+/* Release an array of n elements previously obtained from alloc_elts(n). */
+static void free_elts(struct elt *a, uns n)
+{
+  size_t bytes = n * sizeof(struct elt);
+  big_free(a, bytes);
+}
+
+/* qsort() comparator for struct elt: orders by key, returning -1, 0 or 1. */
+static int comp(const void *x, const void *y)
+{
+  u32 kx = ((const struct elt *) x)->key;
+  u32 ky = ((const struct elt *) y)->key;
+  if (kx < ky)
+    return -1;
+  return kx > ky;
+}
+
+/* qsort() comparator for arrays of pointers to struct elt: compares the pointed-to elements with comp(). */
+static int comp_ind(const void *x, const void *y)
+{
+  const struct elt *ex = *(const struct elt * const *) x;
+  const struct elt *ey = *(const struct elt * const *) y;
+  return comp(ex, ey);
+}
+/*
+ * Two instantiations of the array sorter template from ucw/sorter/array-simple.h:
+ *
+ *   as_sort(count, a)	sorts an array of struct elt by key (the array is
+ *			passed through ASORT_EXTRA_ARGS)
+ *   asi_sort(count)	sorts the global pointer array ind[] by the key of
+ *			the element pointed to
+ *
+ * NOTE(review): the ASORT_* macros are redefined without #undef, so the
+ * template presumably undefines them itself -- confirm in array-simple.h.
+ */
+#define ASORT_PREFIX(x) as_##x
+#define ASORT_KEY_TYPE u32
+#define ASORT_ELT(i) a[i].key
+#define ASORT_SWAP(i,j) do { struct elt t=a[i]; a[i]=a[j]; a[j]=t; } while (0)
+#define ASORT_EXTRA_ARGS , struct elt *a
+#include "ucw/sorter/array-simple.h"
+
+#define ASORT_PREFIX(x) asi_##x
+#define ASORT_KEY_TYPE u32
+#define ASORT_ELT(i) ind[i]->key
+#define ASORT_SWAP(i,j) do { struct elt *t=ind[i]; ind[i]=ind[j]; ind[j]=t; } while (0)
+#include "ucw/sorter/array-simple.h"
+/*
+ * radix1: radix sort starting from the least significant digit, using 8-bit
+ * digits, i.e. four passes over the 32-bit key. Each pass counts the digit
+ * values, turns the counts into bucket start offsets and distributes the
+ * elements from one buffer into the other. The buffers swap roles after
+ * every pass and ary is finally pointed at the one holding the result.
+ */
+static void r1_sort(void)
+{
+ struct elt *from = ary, *to = alt, *tmp;
+#define BITS 8
+ uns cnt[1 << BITS];
+ for (uns sh=0; sh<32; sh+=BITS)
+ {
+ bzero(cnt, sizeof(cnt));
+ for (uns i=0; i<n; i++)
+ cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
+ uns pos = 0; /* running prefix sum: cnt[i] becomes the first output index of bucket i */
+ for (uns i=0; i<(1<<BITS); i++)
+ {
+ uns c = cnt[i];
+ cnt[i] = pos;
+ pos += c;
+ }
+ ASSERT(pos == n);
+ for (uns i=0; i<n; i++)
+ to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
+ ASSERT(cnt[(1 << BITS)-1] == n); /* the cursor of the last bucket must have reached the end */
+ tmp=from, from=to, to=tmp;
+ }
+ ary = from;
+#undef BITS
+}
+
+/*
+ * radix1b: like r1_sort(), but the digit histogram for the next pass is
+ * gathered while the current pass distributes the elements, so only the
+ * first pass needs a separate counting scan.
+ *
+ * Sorts ary by key using alt as the second buffer; ary is finally pointed
+ * at the buffer holding the result.
+ */
+static void r1b_sort(void)
+{
+  struct elt *from = ary, *to = alt, *tmp;
+#define BITS 8
+  uns cnt[1 << BITS], cnt2[1 << BITS];
+  for (uns sh=0; sh<32; sh+=BITS)
+    {
+      if (sh)
+        memcpy(cnt, cnt2, sizeof(cnt));
+      else
+        {
+          bzero(cnt, sizeof(cnt));
+          for (uns i=0; i<n; i++)
+            cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++;
+        }
+      /* Turn the counts into the first output index of every bucket */
+      uns pos = 0;
+      for (uns i=0; i<(1<<BITS); i++)
+        {
+          uns c = cnt[i];
+          cnt[i] = pos;
+          pos += c;
+        }
+      ASSERT(pos == n);
+      if (sh + BITS < 32)
+        {
+          /* Distribute and count the next digit in the same scan */
+          bzero(cnt2, sizeof(cnt2));
+          for (uns i=0; i<n; i++)
+            {
+              cnt2[(from[i].key >> (sh + BITS)) & ((1 << BITS) - 1)]++;
+              to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
+            }
+        }
+      else
+        {
+          /*
+           * Last pass: there is no next digit, and shifting the 32-bit key
+           * by 32 would be undefined behaviour, so only distribute.
+           */
+          for (uns i=0; i<n; i++)
+            to[cnt[(from[i].key >> sh) & ((1 << BITS) - 1)]++] = from[i];
+        }
+      ASSERT(cnt[(1 << BITS)-1] == n);
+      tmp=from, from=to, to=tmp;
+    }
+  ary = from;
+#undef BITS
+}
+/*
+ * radix1c: the four byte-wise passes of the radix sort written out by hand.
+ * Buckets are tracked by write pointers (ptrs[]) instead of indices and the
+ * histogram of the next byte is collected during each distribution pass.
+ * Data moves ary -> alt -> ary -> alt -> ary, so the result ends up in the
+ * buffer ary already points to.
+ */
+static void r1c_sort(void)
+{
+ uns cnt[256];
+ struct elt *ptrs[256], *x, *lim;
+
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ cnt[x++->key & 255]++;
+/* PTRS(start): point ptrs[i] at the start of bucket i inside buffer `start' according to cnt[]; clobbers x */
+#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
+
+ PTRS(alt);
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 8) & 255]++;
+ *ptrs[x->key & 255]++ = *x;
+ x++;
+ }
+
+ PTRS(ary);
+ x = alt; lim = alt + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 16) & 255]++;
+ *ptrs[(x->key >> 8) & 255]++ = *x;
+ x++;
+ }
+
+ PTRS(alt);
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 24) & 255]++;
+ *ptrs[(x->key >> 16) & 255]++ = *x;
+ x++;
+ }
+
+ PTRS(ary);
+ x = alt; lim = alt + n;
+ while (x < lim)
+ {
+ *ptrs[(x->key >> 24) & 255]++ = *x;
+ x++;
+ }
+#undef PTRS
+}
+
+#include <emmintrin.h>
+
+/*
+ * Copy one element using a single 128-bit aligned SSE2 load and store.
+ * Both pointers are accessed with the aligned variants of the intrinsics.
+ */
+static inline void sse_copy_elt(struct elt *to, struct elt *from)
+{
+  _mm_store_si128((__m128i *) to, _mm_load_si128((__m128i *) from));
+}
+/*
+ * radix1c-sse: the same four hand-written passes as r1c_sort(), except that
+ * elements are moved with sse_copy_elt(). This requires struct elt to be
+ * exactly 16 bytes and both buffers to be 16-byte aligned, which is asserted
+ * at the start. The result ends up in the buffer ary already points to.
+ */
+static void r1c_sse_sort(void)
+{
+ uns cnt[256];
+ struct elt *ptrs[256], *x, *lim;
+
+ ASSERT(sizeof(struct elt) == 16);
+ ASSERT(!((uintptr_t)alt & 15));
+ ASSERT(!((uintptr_t)ary & 15));
+
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ cnt[x++->key & 255]++;
+/* PTRS(start): point ptrs[i] at the start of bucket i inside buffer `start' according to cnt[]; clobbers x */
+#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
+
+ PTRS(alt);
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 8) & 255]++;
+ sse_copy_elt(ptrs[x->key & 255]++, x);
+ x++;
+ }
+
+ PTRS(ary);
+ x = alt; lim = alt + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 16) & 255]++;
+ sse_copy_elt(ptrs[(x->key >> 8) & 255]++, x);
+ x++;
+ }
+
+ PTRS(alt);
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 24) & 255]++;
+ sse_copy_elt(ptrs[(x->key >> 16) & 255]++, x);
+ x++;
+ }
+
+ PTRS(ary);
+ x = alt; lim = alt + n;
+ while (x < lim)
+ {
+ sse_copy_elt(ptrs[(x->key >> 24) & 255]++, x);
+ x++;
+ }
+#undef PTRS
+}
+/*
+ * radix1d: r1c_sort() with the loops unrolled by hand. The counting scan
+ * handles four elements per iteration and the distribution passes two, so n
+ * must be a multiple of 4 (asserted). The first distribution pass walks the
+ * two halves of ary in parallel; this mixes up the input order, which is
+ * presumably acceptable only because no earlier pass has established an
+ * order yet. The result ends up in the buffer ary already points to.
+ */
+static void r1d_sort(void)
+{
+ uns cnt[256];
+ struct elt *ptrs[256], *x, *y, *lim;
+
+ ASSERT(!(n % 4));
+
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[x++->key & 255]++;
+ cnt[x++->key & 255]++;
+ cnt[x++->key & 255]++;
+ cnt[x++->key & 255]++;
+ }
+/* PTRS(start): point ptrs[i] at the start of bucket i inside buffer `start' according to cnt[]; clobbers x */
+#define PTRS(start) x=start; for (uns i=0; i<256; i++) { ptrs[i]=x; x+=cnt[i]; }
+
+ PTRS(alt);
+ x = ary; y = ary+n/2; lim = ary + n/2; /* x scans the first half, y the second one */
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 8) & 255]++;
+ cnt[(y->key >> 8) & 255]++;
+ *ptrs[x->key & 255]++ = *x;
+ *ptrs[y->key & 255]++ = *y;
+ x++, y++;
+ cnt[(x->key >> 8) & 255]++;
+ cnt[(y->key >> 8) & 255]++;
+ *ptrs[x->key & 255]++ = *x;
+ *ptrs[y->key & 255]++ = *y;
+ x++, y++;
+ }
+
+ PTRS(ary);
+ x = alt; lim = alt + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 16) & 255]++;
+ *ptrs[(x->key >> 8) & 255]++ = *x;
+ x++;
+ cnt[(x->key >> 16) & 255]++;
+ *ptrs[(x->key >> 8) & 255]++ = *x;
+ x++;
+ }
+
+ PTRS(alt);
+ x = ary; lim = ary + n;
+ bzero(cnt, sizeof(cnt));
+ while (x < lim)
+ {
+ cnt[(x->key >> 24) & 255]++;
+ *ptrs[(x->key >> 16) & 255]++ = *x;
+ x++;
+ cnt[(x->key >> 24) & 255]++;
+ *ptrs[(x->key >> 16) & 255]++ = *x;
+ x++;
+ }
+
+ PTRS(ary);
+ x = alt; lim = alt + n;
+ while (x < lim)
+ {
+ *ptrs[(x->key >> 24) & 255]++ = *x;
+ x++;
+ *ptrs[(x->key >> 24) & 255]++ = *x;
+ x++;
+ }
+#undef PTRS
+}
+/*
+ * radix2: a single radix split on the top 14 bits of the key followed by
+ * as_sort() on every bucket. The elements are distributed from ary into alt,
+ * the buckets are sorted there and ary is repointed at alt.
+ */
+static void r2_sort(void)
+{
+ struct elt *from = ary, *to = alt;
+#define BITS 14
+ uns cnt[1 << BITS];
+ bzero(cnt, sizeof(cnt));
+ for (uns i=0; i<n; i++)
+ cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++;
+ uns pos = 0; /* running prefix sum: cnt[i] becomes the first output index of bucket i */
+ for (uns i=0; i<(1<<BITS); i++)
+ {
+ uns c = cnt[i];
+ cnt[i] = pos;
+ pos += c;
+ }
+ ASSERT(pos == n);
+ for (uns i=0; i<n; i++)
+ to[cnt[(from[i].key >> (32 - BITS)) & ((1 << BITS) - 1)]++] = from[i];
+ ASSERT(cnt[(1 << BITS)-1] == n);
+
+ pos = 0; /* after the distribution cnt[i] is the end of bucket i, so bucket i spans [pos, cnt[i]) */
+ for (uns i=0; i<(1 << BITS); i++)
+ {
+ as_sort(cnt[i] - pos, alt+pos);
+ pos = cnt[i];
+ }
+ ary = alt;
+#undef BITS
+}
+/*
+ * radix3: recursive radix sort starting from the most significant bits.
+ * Every level splits on the next BITS (10) bits of the key; a bucket is
+ * finished with as_sort() once LEVELS (2) levels have been used or when it
+ * holds at most THRESHOLD elements. The recursion is a nested function
+ * (GNU C extension) which alternates the roles of the two buffers.
+ *
+ * ODDEVEN selects the buffer receiving the result: buckets finished at a
+ * level whose parity differs from ODDEVEN are copied back into `from'. With
+ * ODDEVEN == 0 the result ends up in the buffer ary already points to;
+ * otherwise ary is repointed at alt.
+ */
+static void r3_sort(void)
+{
+#define BITS 10
+#define LEVELS 2
+#define BUCKS (1 << BITS)
+#define THRESHOLD 5000
+#define ODDEVEN 0
+
+ auto void r3(struct elt *from, struct elt *to, uns n, uns lev);
+ void r3(struct elt *from, struct elt *to, uns n, uns lev)
+ {
+ uns sh = 32 - lev*BITS; /* level 1 looks at the top BITS bits, level 2 at the next ones */
+ uns cnt[BUCKS];
+ bzero(cnt, sizeof(cnt));
+ for (uns i=0; i<n; i++)
+ cnt[(from[i].key >> sh) & (BUCKS - 1)]++;
+ uns pos = 0;
+ for (uns i=0; i<BUCKS; i++)
+ {
+ uns c = cnt[i];
+ cnt[i] = pos;
+ pos += c;
+ }
+ ASSERT(pos == n);
+ for (uns i=0; i<n; i++)
+#if 1
+ to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++] = from[i];
+#else
+ sse_copy_elt(&to[cnt[(from[i].key >> sh) & (BUCKS - 1)]++], &from[i]);
+#endif
+ pos = 0; /* after the distribution cnt[i] is the end of bucket i */
+ for (uns i=0; i<BUCKS; i++)
+ {
+ uns l = cnt[i]-pos;
+ if (lev >= LEVELS || l <= THRESHOLD)
+ {
+ as_sort(l, to+pos);
+ if ((lev % 2) != ODDEVEN)
+ memcpy(from+pos, to+pos, l * sizeof(struct elt));
+ }
+ else
+ r3(to+pos, from+pos, l, lev+1);
+ pos = cnt[i];
+ }
+ }
+
+ r3(ary, alt, n, 1);
+ if (ODDEVEN)
+ ary = alt;
+
+#undef ODDEVEN
+#undef THRESHOLD
+#undef BUCKS
+#undef LEVELS
+#undef BITS
+}
+/*
+ * Merge the sorted runs [x, xl) and [y, yl) into the output starting at z
+ * and return the position just past the last element written. On equal keys
+ * the element from the first run is taken first. Both runs must be
+ * non-empty, because the first comparison reads *x and *y before any bound
+ * is checked.
+ */
+static inline struct elt *mrg(struct elt *x, struct elt *xl, struct elt *y, struct elt *yl, struct elt *z)
+{
+ for (;;)
+ {
+ if (x->key <= y->key)
+ {
+ *z++ = *x++;
+ if (x >= xl)
+ goto xend;
+ }
+ else
+ {
+ *z++ = *y++;
+ if (y >= yl)
+ goto yend;
+ }
+ }
+
+ xend: /* first run exhausted: append the rest of the second one */
+ while (y < yl)
+ *z++ = *y++;
+ return z;
+
+ yend: /* second run exhausted: append the rest of the first one */
+ while (x < xl)
+ *z++ = *x++;
+ return z;
+}
+/*
+ * Bottom-up merge sort. A special first pass orders adjacent pairs while
+ * copying ary to alt; every further level merges runs of 2^lev elements with
+ * mrg(), alternating the direction between the two buffers. A trailing
+ * partial pair of runs is merged, a single trailing run is just copied.
+ * If the final level has left the result in alt, ary is repointed at it.
+ */
+static void mergesort(void)
+{
+ struct elt *from, *to;
+ uns lev = 0;
+ if (1)
+ {
+ struct elt *x = ary, *z = alt, *last = ary + (n & ~1U);
+ while (x < last)
+ {
+ if (x[0].key < x[1].key)
+ *z++ = *x++, *z++ = *x++;
+ else
+ {
+ *z++ = x[1];
+ *z++ = x[0];
+ x += 2;
+ }
+ }
+ if (n % 2) /* odd element count: the last element forms a run of its own */
+ *z = *x;
+ lev++;
+ }
+ for (; (1U << lev) < n; lev++)
+ {
+ if (lev % 2)
+ from = alt, to = ary;
+ else
+ from = ary, to = alt;
+ struct elt *x, *z, *last;
+ x = from;
+ z = to;
+ last = from + n;
+ uns step = 1 << lev;
+ while (x + 2*step <= last)
+ {
+ z = mrg(x, x+step, x+step, x+2*step, z);
+ x += 2*step;
+ }
+ if (x + step < last)
+ mrg(x, x+step, x+step, last, z);
+ else
+ memcpy(z, x, (byte*)last - (byte*)x);
+ }
+ if (lev % 2)
+ ary = alt;
+}
+
+/*
+ * One level of a 256-way sample sort.
+ *
+ *   n     number of elements to sort (must be non-zero)
+ *   ar    the elements to sort
+ *   al    a second buffer of the same size
+ *   dest  the buffer the caller wants the final result in; compared with al
+ *         to decide whether a finished bucket has to be copied back to ar
+ *   wbuf  scratch space of n bytes remembering the bucket of every element
+ *
+ * WAYS randomly chosen elements are sorted and used as splitters, every
+ * element is classified by a branch-per-bit binary search over them and
+ * then moved to its bucket in al. Buckets of at least 1000 elements are
+ * sorted recursively with the roles of the buffers swapped, smaller ones by
+ * as_sort().
+ */
+static void sampsort(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
+{
+#define WAYS 256
+  struct elt k[WAYS];
+  uns cnt[WAYS];
+  bzero(cnt, sizeof(cnt));
+  for (uns i=0; i<WAYS; i++)
+    k[i] = ar[random() % n];
+  as_sort(WAYS, k);
+  for (uns i=0; i<n; i++)
+    {
+      uns w = 0;
+#define FW(delta) if (ar[i].key > k[w+delta].key) w += delta
+      FW(128);
+      FW(64);
+      FW(32);
+      FW(16);
+      FW(8);
+      FW(4);
+      FW(2);
+      FW(1);
+      wbuf[i] = w;
+      cnt[w]++;
+    }
+  struct elt *y = al, *way[WAYS], *z;
+  for (uns i=0; i<WAYS; i++)
+    {
+      way[i] = y;
+      y += cnt[i];
+    }
+  ASSERT(y == al+n);
+  for (uns i=0; i<n; i++)
+    {
+      uns w = wbuf[i];
+      *way[w]++ = ar[i];
+    }
+  y = al;
+  z = ar;
+  for (uns i=0; i<WAYS; i++)
+    {
+      /*
+       * A bucket which has swallowed all n elements (e.g. when all keys are
+       * equal) would make the recursion repeat forever on the same input,
+       * so such a bucket is handed to as_sort() as well.
+       */
+      if (cnt[i] >= 1000 && cnt[i] < n)
+        sampsort(cnt[i], y, z, dest, wbuf);
+      else
+        {
+          as_sort(cnt[i], y);
+          if (al != dest)
+            memcpy(z, y, cnt[i]*sizeof(struct elt));
+        }
+      y += cnt[i];
+      z += cnt[i];
+    }
+#undef FW
+#undef WAYS
+}
+
+/*
+ * Benchmark entry point for sampsort(): sorts ary with alt as the second
+ * buffer, asking for the result in ary. The per-element bucket map is
+ * allocated here and released afterwards.
+ */
+static void samplesort(void)
+{
+  byte *bucket_of = xmalloc(n);
+
+  sampsort(n, ary, alt, ary, bucket_of);
+  xfree(bucket_of);
+}
+
+/*
+ * Variant of sampsort() whose classification loop handles two elements per
+ * iteration with interleaved binary searches; a possible odd last element
+ * is classified separately.
+ *
+ *   n     number of elements to sort (must be non-zero)
+ *   ar    the elements to sort
+ *   al    a second buffer of the same size
+ *   dest  the buffer the caller wants the final result in; compared with al
+ *         to decide whether a finished bucket has to be copied back to ar
+ *   wbuf  scratch space of n bytes remembering the bucket of every element
+ */
+static void sampsort2(uns n, struct elt *ar, struct elt *al, struct elt *dest, byte *wbuf)
+{
+#define WAYS 256
+  struct elt k[WAYS];
+  uns cnt[WAYS];
+  bzero(cnt, sizeof(cnt));
+  for (uns i=0; i<WAYS; i++)
+    k[i] = ar[random() % n];
+  as_sort(WAYS, k);
+  struct elt *k1 = ar, *k2 = ar+1, *kend = ar+n;
+  byte *ww = wbuf;
+  while (k2 < kend)
+    {
+      uns w1 = 0, w2 = 0;
+#define FW1(delta) if (k1->key > k[w1+delta].key) w1 += delta
+#define FW2(delta) if (k2->key > k[w2+delta].key) w2 += delta
+      FW1(128); FW2(128);
+      FW1(64); FW2(64);
+      FW1(32); FW2(32);
+      FW1(16); FW2(16);
+      FW1(8); FW2(8);
+      FW1(4); FW2(4);
+      FW1(2); FW2(2);
+      FW1(1); FW2(1);
+      *ww++ = w1;
+      *ww++ = w2;
+      cnt[w1]++;
+      cnt[w2]++;
+      k1 += 2;
+      k2 += 2;
+    }
+  if (k1 < kend)
+    {
+      /* Odd number of elements: classify the last one on its own */
+      uns w1 = 0;
+      FW1(128); FW1(64); FW1(32); FW1(16);
+      FW1(8); FW1(4); FW1(2); FW1(1);
+      *ww++ = w1;
+      cnt[w1]++;
+    }
+  struct elt *y = al, *way[WAYS], *z;
+  for (uns i=0; i<WAYS; i++)
+    {
+      way[i] = y;
+      y += cnt[i];
+    }
+  ASSERT(y == al+n);
+  for (uns i=0; i<n; i++)
+    {
+      uns w = wbuf[i];
+      *way[w]++ = ar[i];
+    }
+  y = al;
+  z = ar;
+  for (uns i=0; i<WAYS; i++)
+    {
+      /*
+       * A bucket which has swallowed all n elements (e.g. when all keys are
+       * equal) would make the recursion repeat forever on the same input,
+       * so such a bucket is handed to as_sort() as well.
+       */
+      if (cnt[i] >= 1000 && cnt[i] < n)
+        sampsort2(cnt[i], y, z, dest, wbuf);
+      else
+        {
+          as_sort(cnt[i], y);
+          if (al != dest)
+            memcpy(z, y, cnt[i]*sizeof(struct elt));
+        }
+      y += cnt[i];
+      z += cnt[i];
+    }
+#undef FW1
+#undef FW2
+#undef WAYS
+}
+
+/*
+ * Benchmark entry point for sampsort2(): sorts ary with alt as the second
+ * buffer, asking for the result in ary. The per-element bucket map is
+ * allocated here and released afterwards.
+ */
+static void samplesort2(void)
+{
+  byte *bucket_of = xmalloc(n);
+
+  sampsort2(n, ary, alt, ary, bucket_of);
+  xfree(bucket_of);
+}
+/*
+ * Heap sort of ary in place, built from the HEAP_* macros of ucw/heap.h.
+ * The heap base is ary-1, so heap[1] is ary[0].
+ * NOTE(review): presumably the macros index the heap from 1, and H_LESS is
+ * reversed (>) so that the root is the largest key and repeated
+ * HEAP_DELMIN leaves the array in ascending order -- confirm in heap.h.
+ */
+static void heapsort(void)
+{
+#define H_LESS(_a,_b) ((_a).key > (_b).key)
+ struct elt *heap = ary-1;
+ HEAP_INIT(struct elt, heap, n, H_LESS, HEAP_SWAP);
+ uns nn = n; /* working copy of the heap size; the loop below runs until it drops to zero */
+ while (nn)
+ HEAP_DELMIN(struct elt, heap, nn, H_LESS, HEAP_SWAP);
+#undef H_LESS
+}
+/*
+ * Indirect heap sort: the same scheme as heapsort(), applied to the pointer
+ * array ind[] and comparing the keys of the elements pointed to. Only the
+ * pointers are moved.
+ * NOTE(review): presumably the macros index the heap from 1, and H_LESS is
+ * reversed (>) to obtain ascending order -- confirm in heap.h.
+ */
+static void heapsort_ind(void)
+{
+#define H_LESS(_a,_b) ((_a)->key > (_b)->key)
+ struct elt **heap = ind-1;
+ HEAP_INIT(struct elt *, heap, n, H_LESS, HEAP_SWAP);
+ uns nn = n; /* working copy of the heap size; the loop below runs until it drops to zero */
+ while (nn)
+ HEAP_DELMIN(struct elt *, heap, nn, H_LESS, HEAP_SWAP);
+#undef H_LESS
+}
+/*
+ * Prepare the input of one benchmark: ary and alt are reset to the two
+ * allocated arrays and ary is filled with n elements. On every fourth
+ * element the index is stored into the MD5 block and md5_transform() is run;
+ * the four words of the MD5 state then serve as the keys of the next four
+ * elements. The ballast words are rotations of the key and `sum' collects
+ * the XOR of all keys for the later integrity check.
+ */
+static void mk_ary(void)
+{
+ ary = array0;
+ alt = array1;
+ md5_context ctx;
+ md5_init(&ctx);
+ u32 block[16];
+ bzero(block, sizeof(block));
+
+ sum = 0;
+ for (uns i=0; i<n; i++)
+ {
+#if 1
+ if (!(i % 4))
+ {
+ block[i%16] = i;
+ md5_transform(ctx.buf, block);
+ }
+ ary[i].key = ctx.buf[i%4];
+#else
+ ary[i].key = i*(~0U/(n-1));
+#endif
+ for (uns j=1; j<sizeof(struct elt)/4; j++) /* word 0 is the key itself, the remaining words are the ballast */
+ ((u32*)&ary[i])[j] = ROL(ary[i].key, 3*j);
+ sum ^= ary[i].key;
+ }
+}
+
+/*
+ * Verify the result of a direct sort: the keys in ary must be non-decreasing
+ * and their XOR must equal the checksum recorded by mk_ary(). Any violation
+ * is reported through die().
+ */
+static void chk_ary(void)
+{
+  u32 acc = ary[0].key;
+  for (uns i=1; i<n; i++)
+    {
+      if (ary[i].key >= ary[i-1].key)
+        acc ^= ary[i].key;
+      else
+        die("Missorted at %d", i);
+    }
+  if (acc != sum)
+    die("Corrupted");
+}
+
+/*
+ * Prepare the input of an indirect sort: generate the data with mk_ary() and
+ * allocate ind[] with one pointer per element, in array order. The array is
+ * released by chk_ind().
+ */
+static void mk_ind(void)
+{
+  mk_ary();
+  ind = xmalloc(sizeof(struct elt *) * n);
+  struct elt **slot = ind;
+  for (struct elt *e = ary; e < ary + n; e++)
+    *slot++ = e;
+}
+
+/*
+ * Verify the result of an indirect sort: the keys reached through ind[] must
+ * be non-decreasing and their XOR must equal the checksum recorded by
+ * mk_ary(). Any violation is reported through die(). The pointer array
+ * allocated by mk_ind() is released at the end.
+ */
+static void chk_ind(void)
+{
+  u32 acc = ind[0]->key;
+  for (uns i=1; i<n; i++)
+    {
+      if (ind[i]->key >= ind[i-1]->key)
+        acc ^= ind[i]->key;
+      else
+        die("Missorted at %d", i);
+    }
+  if (acc != sum)
+    die("Corrupted");
+  xfree(ind);
+}
+/*
+ * Benchmark driver. It allocates the two element arrays, measures a memcpy
+ * baseline and then runs every sorting algorithm once on freshly generated
+ * data, logging the value returned by get_timer() for each of them and
+ * verifying the result afterwards. Besides the configuration options only
+ * -1 is accepted; it sets a bit in `op', which nothing below reads.
+ */
+int main(int argc, char **argv)
+{
+ log_init(argv[0]);
+
+ int opt;
+ uns op = 0;
+ while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS "1", CF_NO_LONG_OPTS, NULL)) >= 0)
+ switch (opt)
+ {
+ case '1':
+ op |= (1 << (opt - '0'));
+ break;
+ default:
+ die("usage?");
+ }
+
+ array0 = alloc_elts(n);
+ array1 = alloc_elts(n);
+ for (uns i=0; i<n; i++) /* write to every element once; presumably to get the memory mapped in before anything is timed */
+ array0[i] = array1[i] = (struct elt) { 0 };
+
+ log(L_INFO, "Testing with %u elements", n);
+
+ mk_ary();
+ timestamp_t timer;
+ init_timer(&timer);
+ for (uns i=0; i<5; i++)
+ {
+#if 1
+ memcpy(alt, ary, sizeof(struct elt) * n);
+ memcpy(ary, alt, sizeof(struct elt) * n);
+#else
+ for (uns j=0; j<n; j++)
+ alt[j] = ary[j];
+ for (uns j=0; j<n; j++)
+ ary[j] = alt[j];
+#endif
+ }
+ log(L_DEBUG, "memcpy: %d", get_timer(&timer)/10); /* 5 rounds of 2 copies each, so this is the time of one copy of the array */
+/* BENCH(type, name, func): generate the input with mk_<type>(), time the statement func, log the time under name and verify with chk_<type>() */
+#define BENCH(type, name, func) mk_##type(); init_timer(&timer); func; log(L_DEBUG, name ": %d", get_timer(&timer)); chk_##type()
+
+ BENCH(ary, "qsort", qsort(ary, n, sizeof(struct elt), comp));
+ BENCH(ary, "arraysort", as_sort(n, ary));
+ BENCH(ind, "indirect qsort", qsort(ind, n, sizeof(struct elt *), comp_ind));
+ BENCH(ind, "indirect arraysort", asi_sort(n));
+ BENCH(ary, "radix1", r1_sort());
+ BENCH(ary, "radix1b", r1b_sort());
+ BENCH(ary, "radix1c", r1c_sort());
+ BENCH(ary, "radix1c-sse", r1c_sse_sort());
+ BENCH(ary, "radix1d", r1d_sort());
+ BENCH(ary, "radix2", r2_sort());
+ BENCH(ary, "radix3", r3_sort());
+ BENCH(ary, "mergesort", mergesort());
+ BENCH(ary, "samplesort", samplesort());
+ BENCH(ary, "samplesort2", samplesort2());
+ BENCH(ary, "heapsort", heapsort());
+ BENCH(ary, "indirect heapsort", heapsort_ind());
+
+ free_elts(array0, n);
+ free_elts(array1, n);
+ return 0;
+}