+ brewind(fb_signatures);
+ bputl(fb_signatures, count);
+ DBG("%d signatures written", count);
+
+ imo_decompress_thumbnails_done();
+ buck2obj_free(bob);
+ mp_delete(pool);
+ bclose(fb_cards);
+ bclose(fb_card_attrs);
+ bclose(fb_signatures);
+}
+
+/*********************************************************************************/
+
+struct vectors_node {
+ oid_t oid;
+ u32 temp;
+ struct image_vector vec;
+};
+
+static uns vectors_count;
+static struct vectors_node *vectors;
+
+static void
+vectors_read(void)
+{
+ log(L_DEBUG, "Reading signature vectors");
+ struct fastbuf *fb = index_bopen("image-sig", O_RDONLY);
+ vectors_count = bgetl(fb);
+ if (vectors_count)
+ {
+ vectors = xmalloc(vectors_count * sizeof(struct vectors_node));
+ for (uns i = 0; i < vectors_count; i++)
+ {
+ bread(fb, &vectors[i].oid, sizeof(oid_t));
+ bread(fb, &vectors[i].vec, sizeof(struct image_vector));
+ bskip(fb, bgetc(fb) * sizeof(struct image_region));
+ }
+ }
+ bclose(fb);
+}
+
+static void
+vectors_cleanup(void)
+{
+ log(L_DEBUG, "Freeing signature vectors");
+ if (vectors_count)
+ xfree(vectors);
+}
+
+/*********************************************************************************/
+
+static u64 random_clusters_max_size = 500000;
+static uns random_clusters_max_count = 1000;
+
+#define RANDOM_CLUSTERS_SIZE 0x7fffffff
+#define RANDOM_CLUSTERS_LAST 0x80000000
+
+static struct random_clusters_node {
+ struct vectors_node *node;
+ s32 dot_prod;
+} *random_clusters_temp;
+static uns random_clusters_count;
+
+#define ASORT_PREFIX(x) random_clusters_##x
+#define ASORT_KEY_TYPE s32
+#define ASORT_ELT(i) start[i].dot_prod
+#define ASORT_SWAP(i,j) do { struct random_clusters_node _s = start[i]; start[i] = start[j]; start[j] = _s; } while(0)
+#define ASORT_EXTRA_ARGS , struct random_clusters_node *start
+#include "lib/arraysort.h"
+
+static void
+random_clusters_init(void)
+{
+ if (!vectors_count)
+ return;
+ log(L_INFO, "Initializing random clusters generator");
+ random_clusters_temp = xmalloc(vectors_count * sizeof(struct random_clusters_node));
+ for (uns i = 0; i < vectors_count; i++)
+ random_clusters_temp[i].node = vectors + i;
+}
+
+static void
+random_clusters_build(void)
+{
+ random_clusters_count = 0;
+ if (!vectors_count)
+ return;
+
+ log(L_INFO, "Generating random clusters for duplicates comparision");
+
+ for (uns i = 0; i < vectors_count; i++)
+ vectors[i].temp &= RANDOM_CLUSTERS_SIZE;
+
+ /* Initialize recursion */
+ struct stk {
+ uns count;
+ struct random_clusters_node *start;
+ } stk_top[64], *stk = stk_top + 1;
+ stk->start = random_clusters_temp;
+ stk->count = vectors_count;
+
+ /* Main loop */
+ while (stk != stk_top)
+ {
+ /* Split conditions */
+ uns split;
+ if (stk->count < 2)
+ split = 0;
+ else if (stk->count > random_clusters_max_count)
+ split = 1;
+ else
+ {
+ s64 size = random_clusters_max_size;
+ for (uns i = 0; i < stk->count && size >= 0; i++)
+ size -= stk->start[i].node->temp;
+ split = size < 0;
+ }
+
+ /* BSP leaf node */
+ if (!split)
+ {
+ stk->start[stk->count - 1].node->temp |= RANDOM_CLUSTERS_LAST;
+ random_clusters_count++;
+ stk--;
+ }
+
+ /* BSP internal node */
+ else
+ {
+ /* Generate random normal vector of the splitting plane */
+ int normal[IMAGE_VEC_K];
+ for (uns i = 0; i < IMAGE_VEC_K; i++)
+ normal[i] = random_max(0x20001) - 0x10000;
+
+ /* Compute dot produts */
+ for (uns i = 0; i < stk->count; i++)
+ {
+ stk->start[i].dot_prod = 0;
+ for (uns j = 0; j < IMAGE_VEC_K; j++)
+ stk->start[i].dot_prod += normal[j] * stk->start[i].node->vec.f[j];
+ }
+
+ /* Sort... could be faster, because we only need the median */
+ random_clusters_sort(stk->count, stk->start);
+
+ /* Split in the middle */
+ stk[1].count = stk[0].count >> 1;
+ stk[0].count -= stk[1].count;
+ stk[1].start = stk[0].start;
+ stk[0].start += stk[1].count;
+ stk++;
+ }
+ }
+ log(L_INFO, "Generated %u clusters", random_clusters_count);
+}
+
+static void
+random_clusters_cleanup(void)
+{
+ if (vectors_count)
+ xfree(random_clusters_temp);
+}
+
+/*********************************************************************************/
+
+// FIXME: use vectors_read()... duplicate code
+
+struct signature_record {
+ oid_t oid;
+ struct image_vector vec;
+};
+
+#define ASORT_PREFIX(x) build_search_tree_##x
+#define ASORT_KEY_TYPE struct signature_record *
+#define ASORT_ELT(i) rec[i]
+#define ASORT_LT(x,y) x->vec.f[dim] < y->vec.f[dim]
+#define ASORT_EXTRA_ARGS , uns dim, struct signature_record **rec
+#include "lib/arraysort.h"
+
+#if 0
+#define DBG_KD(x...) DBG(x)
+#else
+#define DBG_KD(x...) do{}while(0)
+#endif
+
+static struct image_tree tree;
+static struct signature_record *records;
+static struct signature_record **precords;
+
+static void
+build_kd_tree(void)
+{
+ log(L_INFO, "Building KD-tree");
+
+ struct fastbuf *fb_signatures = index_bopen("image-sig", O_RDONLY);
+ tree.count = bgetl(fb_signatures);
+ ASSERT(tree.count < 0x80000000);
+ if (!tree.count)
+ {
+ /* FIXME */
+ bclose(fb_signatures);
+ die("There are no signatures");
+ }
+ else
+ {
+ DBG("Reading %d signatures", tree.count);
+ records = xmalloc(tree.count * sizeof(struct signature_record));
+ precords = xmalloc(tree.count * sizeof(void *));
+ for (uns i = 0; i < tree.count; i++)
+ {
+ bread(fb_signatures, &records[i].oid, sizeof(oid_t));
+ bread(fb_signatures, &records[i].vec, sizeof(struct image_vector));
+ uns len = bgetc(fb_signatures);
+ bskip(fb_signatures, len * sizeof(struct image_region));
+ precords[i] = records + i;
+ if (likely(i))
+ for (uns j = 0; j < IMAGE_VEC_K; j++)
+ {
+ tree.bbox.vec[0].f[j] = MIN(tree.bbox.vec[0].f[j], records[i].vec.f[j]);
+ tree.bbox.vec[1].f[j] = MAX(tree.bbox.vec[1].f[j], records[i].vec.f[j]);
+ }
+ else
+ tree.bbox.vec[0] = tree.bbox.vec[1] = records[0].vec;
+ }
+ bclose(fb_signatures);
+
+ for (tree.depth = 1; (uns)(2 << tree.depth) < tree.count; tree.depth++);
+ DBG("depth=%d nodes=%d bbox=[(%s), (%s)]", tree.depth, 1 << tree.depth,
+ stk_print_image_vector(tree.bbox.vec + 0), stk_print_image_vector(tree.bbox.vec + 1));
+ uns leaves_index = 1 << (tree.depth - 1);
+ tree.nodes = xmalloc_zero((1 << tree.depth) * sizeof(struct image_node));
+ tree.leaves = xmalloc_zero(tree.count * sizeof(struct image_leaf));
+
+ /* Initialize recursion */
+ struct stk {
+ struct image_bbox bbox;
+ uns index, count;
+ struct signature_record **start;
+ } stk_top[32], *stk = stk_top + 1;
+ stk->index = 1;
+ stk->start = precords;
+ stk->count = tree.count;
+ stk->bbox.vec[0] = tree.bbox.vec[0];
+ for (uns i = 0; i < IMAGE_VEC_K; i++)
+ stk->bbox.vec[1].f[i] = tree.bbox.vec[1].f[i] - tree.bbox.vec[0].f[i];
+ uns entry_index = 0;
+
+ /* Main loop */
+ while (stk != stk_top)
+ {
+ DBG_KD("Main loop... depth=%d index=%d count=%d, start=%d, min=%s dif=%s",
+ stk - stk_top, stk->index, stk->count, stk->start - precords,
+ stk_print_image_vector(stk->bbox.vec + 0), stk_print_image_vector(stk->bbox.vec + 1));
+ ASSERT(stk->count);
+
+ /* Create leaf node */
+ if (stk->index >= leaves_index || stk->count < 2)
+ {
+ tree.nodes[stk->index].val = IMAGE_NODE_LEAF | entry_index;
+ for (; stk->count--; stk->start++)
+ {
+ struct image_leaf *leaf = &tree.leaves[entry_index++];
+ struct signature_record *record = *stk->start;
+ leaf->oid = record->oid;
+ leaf->flags = 0;
+ for (uns i = IMAGE_VEC_K; i--; )
+ {
+ uns bits = IMAGE_LEAF_BITS(i);
+ leaf->flags <<= bits;
+ if (stk->bbox.vec[1].f[i])
+ {
+ uns value =
+ (record->vec.f[i] - stk->bbox.vec[0].f[i]) *
+ ((1 << bits) - 1) / stk->bbox.vec[1].f[i];
+ ASSERT(value < (uns)(1 << bits));
+ leaf->flags |= value;
+ }
+ }
+ if (!stk->count)
+ leaf->flags |= IMAGE_LEAF_LAST;
+ DBG_KD("Creating leaf node; oid=%d vec=(%s) flags=0x%08x",
+ leaf->oid, stk_print_image_vector(&record->vec), leaf->flags);
+ }
+ stk--;
+ }
+
+ /* Create internal node */
+ else
+ {
+ /* Select dimension to splis */
+ uns dim = 0;
+ for (uns i = 1; i < IMAGE_VEC_K; i++)
+ if (stk->bbox.vec[1].f[i] > stk->bbox.vec[1].f[dim])
+ dim = i;
+
+ /* Sort... FIXME: we only need the median */
+ build_search_tree_sort(stk->count, dim, stk->start);
+
+ /* Split in the middle */
+ uns index = stk->index;
+ stk[1].index = stk[0].index * 2;
+ stk[0].index = stk[1].index + 1;
+ stk[1].count = stk[0].count >> 1;
+ stk[0].count -= stk[1].count;
+ stk[1].start = stk[0].start;
+ stk[0].start += stk[1].count;
+
+ /* Choose split value */
+ uns lval = stk->start[-1]->vec.f[dim];
+ uns rval = stk->start[0]->vec.f[dim];
+ uns pivot = stk->bbox.vec[0].f[dim] + (stk->bbox.vec[1].f[dim] >> 1);
+ if (pivot <= lval)
+ pivot = lval;
+ else if (pivot >= rval)
+ pivot = rval;
+
+ DBG_KD("Created internal node; dim=%d pivot=%d", dim, pivot);
+
+ /* Split the box */
+ stk[1].bbox = stk[0].bbox;
+ stk[1].bbox.vec[1].f[dim] = pivot - stk[0].bbox.vec[0].f[dim];
+ stk[0].bbox.vec[0].f[dim] += stk[1].bbox.vec[1].f[dim];
+ stk[0].bbox.vec[1].f[dim] -= stk[1].bbox.vec[1].f[dim];
+
+ /* Fill the node structure */
+ tree.nodes[index].val = dim + (pivot << 8);
+ stk++;
+ }
+ }
+
+ DBG("Tree constructed, saving...");
+
+ struct fastbuf *fb_tree = index_bopen("image-tree", O_CREAT | O_WRONLY | O_TRUNC);
+ bputl(fb_tree, tree.count);
+ bputl(fb_tree, tree.depth);
+ bwrite(fb_tree, &tree.bbox, sizeof(struct image_bbox));
+ bwrite(fb_tree, tree.nodes + 1, ((1 << tree.depth) - 1) * sizeof(struct image_node));
+ bwrite(fb_tree, tree.leaves, tree.count * sizeof(struct image_leaf));
+ bclose(fb_tree);
+
+ //xfree(tree.leaves);
+ //xfree(tree.nodes);
+ //xfree(precords);
+ //xfree(records);
+ }
+}
+
+/*********************************************************************************/
+
+#if 0
+
+struct pass1_hilbert {
+ u32 index;
+ struct image_vector vec;
+};
+
+struct pass1_node {
+ cnode lru_node;
+ cnode buf_node;
+ uns buf_size;
+ byte *buf;
+ oid_t oid;
+ byte *url;
+ struct image image;
+ struct image_dup dup;
+};
+
+static uns pass1_buf_size = 400 << 20;
+static uns pass1_max_count = 100000;
+static uns pass1_search_dist = 40;
+static uns pass1_search_count = 500;
+
+static struct mempool *pass1_pool;
+static struct pass1_hilbert *pass1_hilbert_list;
+static byte *pass1_buf_start;
+static byte *pass1_buf_pos;
+static uns pass1_buf_free;
+static uns pass1_buf_used;
+static clist pass1_buf_list;
+static clist pass1_lru_list;
+static u64 pass1_lookups;
+static u64 pass1_reads;
+static u64 pass1_pairs;
+static u64 pass1_dups;
+static u64 pass1_shrinks;
+static u64 pass1_alloc_sum;
+
+#define HILBERT_PREFIX(x) pass1_hilbert_##x
+#define HILBERT_TYPE byte
+#define HILBERT_ORDER 8
+#define HILBERT_DIM IMAGE_VEC_K
+#define HILBERT_WANT_ENCODE
+#include "images/hilbert.h"
+
+#define ASORT_PREFIX(x) pass1_hilbert_sort_##x
+#define ASORT_KEY_TYPE struct image_vector *
+#define ASORT_ELT(i) (&pass1_hilbert_list[i].vec)
+#define ASORT_LT(x,y) (memcmp(x, y, sizeof(*x)) < 0)
+#define ASORT_SWAP(i,j) do { struct pass1_hilbert _s; \
+ _s = pass1_hilbert_list[i]; \
+ pass1_hilbert_list[i] = pass1_hilbert_list[j]; \
+ pass1_hilbert_list[j] = _s; } while(0)
+#include "lib/arraysort.h"
+
+static void
+pass1_hilbert_sort(void)
+{
+ DBG("Computing positions on the Hilbert curve");
+ pass1_hilbert_list = xmalloc(tree.count * sizeof(struct pass1_hilbert));
+ for (uns i = 0; i < tree.count; i++)
+ {
+ struct pass1_hilbert *h = pass1_hilbert_list + i;
+ h->index = i;
+ byte vec[IMAGE_VEC_K];
+ pass1_hilbert_encode(vec, precords[i]->vec.f);
+ for (uns j = 0; j < IMAGE_VEC_K; j++)
+ h->vec.f[j] = vec[IMAGE_VEC_K - 1 - j];
+ }
+ DBG("Sorting signatures in order of incresing parameters on the Hilbert curve");
+ pass1_hilbert_sort_sort(tree.count);
+#if 0
+ for (uns i = 0; i < tree.count; i++)
+ {
+ if (i)
+ {
+ byte *v1 = precords[pass1_hilbert_list[i - 1].index]->vec.f;
+ byte *v2 = precords[pass1_hilbert_list[i].index]->vec.f;
+#define SQR(x) ((x)*(x))
+ uns dist = 0;
+ for (uns j = 0; j < 6; j++)
+ dist += SQR(v1[j] - v2[j]);
+ DBG("dist %d", dist);
+ }
+ DBG("index %d", pass1_hilbert_list[i].index);
+ }
+#endif
+}
+
+static void
+pass1_hilbert_cleanup(void)
+{
+ xfree(pass1_hilbert_list);
+}
+
+#define HASH_PREFIX(x) pass1_hash_##x
+#define HASH_NODE struct pass1_node
+#define HASH_KEY_ATOMIC oid
+#define HASH_WANT_CLEANUP
+#define HASH_WANT_FIND
+#define HASH_WANT_NEW
+#define HASH_WANT_REMOVE
+#include "lib/hashtable.h"
+
+static inline void
+pass1_buf_init(void)
+{
+ //DBG("pass1_buf_init()");
+ pass1_buf_free = pass1_buf_size;
+ pass1_buf_start = pass1_buf_pos = xmalloc(pass1_buf_size);
+ pass1_buf_used = 0;
+}
+
+static inline void
+pass1_buf_cleanup(void)
+{
+ //DBG("pass1_buf_cleanup()");
+ xfree(pass1_buf_start);
+}
+
+static void
+pass1_node_free(struct pass1_node *node)
+{
+ //DBG("pass1_node_free(%d)", (uns)node->oid);
+ if (node->buf_size)
+ {
+ pass1_buf_used -= node->buf_size;
+ clist_remove(&node->buf_node);
+ }
+ clist_remove(&node->lru_node);
+ pass1_hash_remove(node);
+}
+
+static inline void
+pass1_node_free_lru(void)
+{
+ ASSERT(!clist_empty(&pass1_lru_list));
+ pass1_node_free(SKIP_BACK(struct pass1_node, lru_node, clist_head(&pass1_lru_list)));
+}
+
+static inline void
+pass1_node_after_move(struct pass1_node *node, addr_int_t move)
+{
+ //DBG("pass1_node_after_mode(%d, %d)", (uns)node->oid, (uns)move);
+ /* adjust internal pointers */
+#define MOVE(x) x = (byte *)(x) - move
+ MOVE(node->url);
+ MOVE(node->image.pixels);
+ MOVE(node->dup.buf);
+#undef MOVE
+}
+
+static inline void
+pass1_buf_shrink(void)
+{
+ DBG("pass1_buf_shrink()");
+ pass1_shrinks++;
+ pass1_buf_free = pass1_buf_size;
+ pass1_buf_pos = pass1_buf_start;
+ CLIST_FOR_EACH(void *, p, pass1_buf_list)
+ {
+ struct pass1_node *node = SKIP_BACK(struct pass1_node, buf_node, p);
+ if (node->buf != pass1_buf_pos)
+ {
+ memmove(pass1_buf_pos, node->buf, node->buf_size);
+ pass1_node_after_move(node, node->buf - pass1_buf_pos);
+ node->buf = pass1_buf_pos;
+ }
+ pass1_buf_pos += node->buf_size;
+ pass1_buf_free -= node->buf_size;
+ }
+}
+
+static void *
+pass1_buf_alloc(uns size)
+{
+ //DBG("pass1_buf_alloc(%d)", size);
+
+ /* if there is not enough free space at the end of the buffer */
+ if (size > pass1_buf_free)
+ {
+ /* free some lru nodes */
+ //DBG("freeing lru nodes");
+ while (size > pass1_buf_size - pass1_buf_used || pass1_buf_used > pass1_buf_size / 2)
+ {
+ if (unlikely(clist_empty(&pass1_lru_list))) // FIXME
+ die("Buffer too small");
+ pass1_node_free_lru();
+ }
+
+ pass1_buf_shrink();
+ }
+
+ /* final allocation */
+ void *result = pass1_buf_pos;
+ pass1_buf_pos += size;
+ pass1_buf_free -= size;
+ pass1_buf_used += size;
+ pass1_alloc_sum += size;
+ return result;
+}
+
+static struct pass1_node *
+pass1_node_new(oid_t oid)
+{
+ DBG("pass1_node_new(%d)", (uns)oid);
+ if (pass1_hash_table.hash_count == pass1_max_count)
+ pass1_node_free_lru();
+ struct pass1_node *node = pass1_hash_new(oid);
+ mp_flush(pass1_pool);
+ pass1_reads++;
+
+ /* read object */
+ struct card_attr ca;
+ bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca)); /* FIXME: these seeks can be easily removed */
+ bread(fb_card_attrs, &ca, sizeof(ca));
+
+ bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT); /* FIXME: maybe a presort should handle these random seeks */
+ uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
+ uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
+ struct odes *obj = obj_read_bucket(buck2obj, pass1_pool, buck_type, buck_len, fb_cards, NULL);
+ if (unlikely(!obj))
+ die("Failed to read card");
+ byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U');
+ uns url_len = strlen(url);
+
+ /* decompress thumbnail */
+ struct image_obj imo;
+ imo_init(&imo, pass1_pool, obj);
+ if (unlikely(!imo_decompress_thumbnail(&imo)))
+ die("Cannot decompress thumbnail");
+ node->image = imo.thumb;
+
+ /* create duplicates comparision object */
+ image_dup_init(&node->dup, &node->image, pass1_pool);
+
+ /* copy data */
+ //DBG("loaded image %s s=%d d=%d", url, node->image.size, node->dup.buf_size);
+ node->buf_size = node->image.size + node->dup.buf_size + url_len + 1;
+ if (node->buf_size)
+ {
+ byte *buf = node->buf = pass1_buf_alloc(node->buf_size);
+ clist_add_tail(&pass1_buf_list, &node->buf_node);
+#define COPY(ptr, size) ({ void *_p=buf; uns _size=(size); buf+=_size; memcpy(_p,(ptr),_size); _p; })
+ node->url = COPY(url, url_len + 1);
+ node->image.pixels = COPY(node->image.pixels, node->image.size);
+ node->dup.buf = COPY(node->dup.buf, node->dup.buf_size);
+#undef COPY
+ }
+
+ /* add to lru list */
+ return node;
+}