+static void
+pass1_hilbert_cleanup(void)
+{
+ xfree(pass1_hilbert_list);
+}
+
+#define HASH_PREFIX(x) pass1_hash_##x
+#define HASH_NODE struct pass1_node
+#define HASH_KEY_ATOMIC oid
+#define HASH_WANT_CLEANUP
+#define HASH_WANT_FIND
+#define HASH_WANT_NEW
+#define HASH_WANT_REMOVE
+#include "lib/hashtable.h"
+
+static inline void
+pass1_buf_init(void)
+{
+ //DBG("pass1_buf_init()");
+ pass1_buf_free = pass1_buf_size;
+ pass1_buf_start = pass1_buf_pos = xmalloc(pass1_buf_size);
+ pass1_buf_used = 0;
+}
+
+static inline void
+pass1_buf_cleanup(void)
+{
+ //DBG("pass1_buf_cleanup()");
+ xfree(pass1_buf_start);
+}
+
+static void
+pass1_node_free(struct pass1_node *node)
+{
+ //DBG("pass1_node_free(%d)", (uns)node->oid);
+ if (node->buf_size)
+ {
+ pass1_buf_used -= node->buf_size;
+ clist_remove(&node->buf_node);
+ }
+ clist_remove(&node->lru_node);
+ pass1_hash_remove(node);
+}
+
+static inline void
+pass1_node_free_lru(void)
+{
+ ASSERT(!clist_empty(&pass1_lru_list));
+ pass1_node_free(SKIP_BACK(struct pass1_node, lru_node, clist_head(&pass1_lru_list)));
+}
+
+static inline void
+pass1_node_after_move(struct pass1_node *node, addr_int_t move)
+{
+ //DBG("pass1_node_after_mode(%d, %d)", (uns)node->oid, (uns)move);
+ /* adjust internal pointers */
+#define MOVE(x) x = (byte *)(x) - move
+ MOVE(node->url);
+ MOVE(node->image.pixels);
+ MOVE(node->dup.buf);
+#undef MOVE
+}
+
+static inline void
+pass1_buf_shrink(void)
+{
+ DBG("pass1_buf_shrink()");
+ pass1_shrinks++;
+ pass1_buf_free = pass1_buf_size;
+ pass1_buf_pos = pass1_buf_start;
+ CLIST_FOR_EACH(void *, p, pass1_buf_list)
+ {
+ struct pass1_node *node = SKIP_BACK(struct pass1_node, buf_node, p);
+ if (node->buf != pass1_buf_pos)
+ {
+ memmove(pass1_buf_pos, node->buf, node->buf_size);
+ pass1_node_after_move(node, node->buf - pass1_buf_pos);
+ node->buf = pass1_buf_pos;
+ }
+ pass1_buf_pos += node->buf_size;
+ pass1_buf_free -= node->buf_size;
+ }
+}
+
+static void *
+pass1_buf_alloc(uns size)
+{
+ //DBG("pass1_buf_alloc(%d)", size);
+
+ /* if there is not enough free space at the end of the buffer */
+ if (size > pass1_buf_free)
+ {
+ /* free some lru nodes */
+ //DBG("freeing lru nodes");
+ while (size > pass1_buf_size - pass1_buf_used || pass1_buf_used > pass1_buf_size / 2)
+ {
+ if (unlikely(clist_empty(&pass1_lru_list))) // FIXME
+ die("Buffer too small");
+ pass1_node_free_lru();
+ }
+
+ pass1_buf_shrink();
+ }
+
+ /* final allocation */
+ void *result = pass1_buf_pos;
+ pass1_buf_pos += size;
+ pass1_buf_free -= size;
+ pass1_buf_used += size;
+ pass1_alloc_sum += size;
+ return result;
+}
+
+static struct pass1_node *
+pass1_node_new(oid_t oid)
+{
+ DBG("pass1_node_new(%d)", (uns)oid);
+ if (pass1_hash_table.hash_count == pass1_max_count)
+ pass1_node_free_lru();
+ struct pass1_node *node = pass1_hash_new(oid);
+ mp_flush(pass1_pool);
+ pass1_reads++;
+
+ /* read object */
+ struct card_attr ca;
+ bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca)); /* FIXME: these seeks can be easily removed */
+ bread(fb_card_attrs, &ca, sizeof(ca));
+
+ bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT); /* FIXME: maybe a presort should handle these random seeks */
+ uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
+ uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
+ struct odes *obj = obj_read_bucket(buck2obj, pass1_pool, buck_type, buck_len, fb_cards, NULL);
+ if (unlikely(!obj))
+ die("Failed to read card");
+ byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U');
+ uns url_len = strlen(url);
+
+ /* decompress thumbnail */
+ struct image_obj imo;
+ imo_init(&imo, pass1_pool, obj);
+ if (unlikely(!imo_decompress_thumbnail(&imo)))
+ die("Cannot decompress thumbnail");
+ node->image = imo.thumb;
+
+ /* create duplicates comparision object */
+ image_dup_init(&node->dup, &node->image, pass1_pool);
+
+ /* copy data */
+ //DBG("loaded image %s s=%d d=%d", url, node->image.size, node->dup.buf_size);
+ node->buf_size = node->image.size + node->dup.buf_size + url_len + 1;
+ if (node->buf_size)
+ {
+ byte *buf = node->buf = pass1_buf_alloc(node->buf_size);
+ clist_add_tail(&pass1_buf_list, &node->buf_node);
+#define COPY(ptr, size) ({ void *_p=buf; uns _size=(size); buf+=_size; memcpy(_p,(ptr),_size); _p; })
+ node->url = COPY(url, url_len + 1);
+ node->image.pixels = COPY(node->image.pixels, node->image.size);
+ node->dup.buf = COPY(node->dup.buf, node->dup.buf_size);
+#undef COPY
+ }
+
+ /* add to lru list */
+ return node;
+}
+
+static inline struct pass1_node *
+pass1_node_lock(oid_t oid)
+{
+ DBG("pass1_node_lock(%d)", (uns)oid);
+ pass1_lookups++;
+ struct pass1_node *node = pass1_hash_find(oid);
+ if (node)
+ {
+ clist_remove(&node->lru_node);
+ return node;
+ }
+ else
+ return pass1_node_new(oid);
+}
+
+static inline void
+pass1_node_unlock(struct pass1_node *node)
+{
+ //DBG("pass1_node_unlock(%d)", (uns)node->oid);
+ clist_add_tail(&pass1_lru_list, &node->lru_node);
+}
+
+static void
+pass1_show_stats(void)
+{
+ log(L_INFO, "%d count, %Ld lookups, %Ld reads, %Ld pairs, %Ld dups, %Ld shrinks", tree.count,
+ (long long int)pass1_lookups, (long long int)pass1_reads,
+ (long long int)pass1_pairs, (long long int)pass1_dups, (long long int)pass1_shrinks);
+}
+
+static void
+pass1(void)
+{
+ log(L_INFO, "Looking for duplicates");
+ ASSERT(tree.nodes);
+
+ /* initialization */
+ pass1_lookups = pass1_reads = pass1_pairs = pass1_dups = pass1_shrinks = pass1_alloc_sum = 0;
+ fb_cards = bopen("index/cards", O_RDONLY, 10000); // FIXME
+ fb_card_attrs = bopen("index/card-attrs", O_RDONLY, sizeof(struct card_attr)); // FIXME
+ buck2obj = buck2obj_alloc();
+ imo_decompress_thumbnails_init();
+ clist_init(&pass1_lru_list);
+ clist_init(&pass1_buf_list);
+ pass1_hash_init();
+ pass1_buf_init();
+ pass1_pool = mp_new(1 << 20);
+
+ /* Hilbert sort */
+ pass1_hilbert_sort();
+ pass1_hilbert_cleanup();
+
+ /* main loop */
+ for (uns i = 0; i < tree.count; )
+ {
+ /* lookup next image */
+ oid_t oid = tree.leaves[i].oid;
+ struct pass1_node *node = pass1_node_lock(oid);
+
+ /* compare with all near images */
+ struct image_search search;
+ image_search_init(&search, &tree, &precords[i]->vec, pass1_search_dist);
+ /* FIXME: can be faster than general search in KD-tree */
+ oid_t oid2;
+ uns dist;
+ for (uns j = 0; j < pass1_search_count && image_search_next(&search, &oid2, &dist); j++)
+ {
+ if (oid < oid2)
+ {
+ struct pass1_node *node2 = pass1_node_lock(oid2);
+ DBG("comparing %d and %d", oid, oid2);
+ if (image_dup_compare(&node->dup, &node2->dup, IMAGE_DUP_TRANS_ID))
+ {
+ pass1_dups++;
+ log(L_DEBUG, "*** Found duplicates oid1=0x%x oid=0x%x", (uns)node->oid, (uns)node2->oid);
+ log(L_DEBUG, " %s", node->url);
+ log(L_DEBUG, " %s", node2->url);
+ }
+ pass1_pairs++;
+ pass1_node_unlock(node2);
+ }
+ }
+ image_search_done(&search);
+ pass1_node_unlock(node);
+ i++;
+ if (i % 1000 == 0)
+ log(L_DEBUG, "... passed %d images", i);
+ }
+
+ /* clean up */
+ pass1_hash_cleanup();
+ pass1_buf_cleanup();
+ mp_delete(pass1_pool);
+ bclose(fb_cards);
+ bclose(fb_card_attrs);
+ buck2obj_free(buck2obj);
+ imo_decompress_thumbnails_done();
+
+ /* print statistics */
+ pass1_show_stats();
+}
+
+/*********************************************************************************/
+
+static uns pass2_clusterings_count = 1;
+
+static void
+pass2_estimate_sizes(void)
+{
+ if (!vectors_count)
+ return;
+ log(L_DEBUG, "Reading image sizes");
+
+ /* FIXME: hack, these reads are not necessary, can be done in previous phases */
+ struct fastbuf *fb_cards = index_bopen("cards", O_RDONLY);
+ struct fastbuf *fb_card_attrs = index_bopen("card-attrs", O_RDONLY);
+ struct mempool *pool = mp_new(1 << 16);
+ struct buck2obj_buf *bob = buck2obj_alloc();
+
+ for (uns i = 0; i < vectors_count; i++)
+ {
+ oid_t oid = vectors[i].oid;
+ struct card_attr ca;
+ bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca));
+ bread(fb_card_attrs, &ca, sizeof(ca));
+ bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT);
+ uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
+ uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
+ mp_flush(pool);
+ struct odes *obj = obj_read_bucket(bob, pool, buck_type, buck_len, fb_cards, NULL);
+ byte *attr = obj_find_aval(obj, 'G');
+ ASSERT(attr);
+ uns image_width, image_height, image_colors, thumb_width, thumb_height;
+ byte color_space[MAX_ATTR_SIZE];
+ sscanf(attr, "%d%d%s%d%d%d", &image_width, &image_height, color_space, &image_colors, &thumb_width, &thumb_height);
+ vectors[i].temp = image_dup_estimate_size(thumb_width, thumb_height) +
+ sizeof(struct image) + thumb_width * thumb_height * 3;
+ }
+ buck2obj_free(bob);
+ mp_delete(pool);
+ bclose(fb_cards);
+ bclose(fb_card_attrs);
+}
+
+static void
+pass2(void)
+{
+ // FIXME: presorts, much allocated memory when not needed
+ vectors_read();
+ pass2_estimate_sizes();
+ random_clusters_init();
+ for (uns clustering = 0; clustering < pass2_clusterings_count; clustering++)
+ {
+ random_clusters_build();
+ // FIXME
+ // - external sort
+ // - generate and compare pairs in clusters
+ }
+ random_clusters_cleanup();
+ vectors_cleanup();
+}
+#endif
+
+/*********************************************************************************/