X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=images%2Fimage-idx.c;h=9e5ece0fd962b2b2aa1c28f01e709f3696fc7d12;hb=97df3ea4bd85cae18c59f569000328202c685d1c;hp=058c7774e92c4d0c4b7db44a4fcb8ac9a2db78d0;hpb=3641e85c1412a524ed541360d9fa35268a6e83e0;p=libucw.git diff --git a/images/image-idx.c b/images/image-idx.c index 058c7774..9e5ece0f 100644 --- a/images/image-idx.c +++ b/images/image-idx.c @@ -1,8 +1,11 @@ -#define LOCAL_DEBUG +// FIXME: this file is full of experiments... will be completely different in final version + +#undef LOCAL_DEBUG #include "sherlock/sherlock.h" #include "lib/mempool.h" #include "lib/conf.h" +#include "lib/getopt.h" #include "lib/fastbuf.h" #include "lib/chartype.h" #include "sherlock/object.h" @@ -20,20 +23,29 @@ #include "lang/lang.h" #include "lib/base224.h" #include "lib/bbuf.h" +#include "lib/clists.h" #include "images/images.h" -#include "images/image-thumb.h" +#include "images/image-obj.h" +#include "images/image-sig.h" +#include "images/dup-cmp.h" +#include "images/kd-tree.h" +#include "images/color.h" #include #include #include +static struct fastbuf *fb_cards; +static struct fastbuf *fb_card_attrs; +static struct buck2obj_buf *buck2obj; + /* This should happen in gatherer or scanner */ -UNUSED static void +static void generate_signatures(uns limit) { - struct fastbuf *fb_cards = index_bopen("cards", O_RDONLY); - struct fastbuf *fb_card_attrs = index_bopen("card-attrs", O_RDONLY); + fb_cards = index_bopen("cards", O_RDONLY); + fb_card_attrs = index_bopen("card-attrs", O_RDONLY); struct fastbuf *fb_signatures = index_bopen("image-sig", O_CREAT | O_WRONLY | O_TRUNC); struct card_attr ca; struct image_signature sig; @@ -41,9 +53,12 @@ generate_signatures(uns limit) struct buck2obj_buf *bob = buck2obj_alloc(); uns count = 0; - log(L_INFO, "Generating image signatures"); + if (limit == ~0U) + log(L_INFO, "Generating image signatures"); + else + log(L_INFO, "Generating at most %d image signatures", limit); bputl(fb_signatures, 0); - compute_image_signature_prepare(); + imo_decompress_thumbnails_init(); for (oid_t oid = 0; bread(fb_card_attrs, &ca, sizeof(ca)); oid++) if ((uns)((ca.type_flags >> 4) - 8) < 4) @@ -58,24 +73,15 @@ generate_signatures(uns limit) die("Failed to read card"); if (attr = obj_find_attr(obj, 'N')) { - DBG("Reading oid=%d url=%s", oid, obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U')); - /*bb_t buf; - uns buf_len = 0; - bb_init(&buf); - for (; attr; attr = attr->same) - { - uns len = strlen(attr->val); - bb_grow(&buf, buf_len + len); - memcpy(buf.ptr + buf_len, attr->val, len); - buf_len += len; - } - byte thumb[buf_len]; - uns thumb_len = base224_decode(thumb, buf.ptr, buf_len);*/ - struct image thumb; - int err; - if (!(err = decompress_thumbnail(obj, pool, &thumb))) +#ifdef LOCAL_DEBUG + byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U'); + DBG("Reading oid=%d url=%s", oid, url); +#endif + struct image_obj imo; + imo_init(&imo, pool, obj); + if (imo_decompress_thumbnail(&imo)) { - if (!(err = compute_image_signature(&thumb, &sig))) + if (compute_image_signature(&imo.thumb, &sig)) { bwrite(fb_signatures, &oid, sizeof(oid)); bwrite(fb_signatures, &sig.vec, sizeof(struct image_vector)); @@ -83,21 +89,23 @@ generate_signatures(uns limit) if (sig.len) bwrite(fb_signatures, sig.reg, sig.len * sizeof(struct image_region)); count++; + if (count % 10000 == 0) + log(L_DEBUG, "... passed %d images", count); if (count >= limit) break; } else - DBG("Cannot create signature, error=%d", err); + DBG("Cannot create signature"); } else - DBG("Cannot decompress thumbnail, error=%d", err); + DBG("Cannot decompress thumbnail"); } } brewind(fb_signatures); bputl(fb_signatures, count); DBG("%d signatures written", count); - compute_image_signature_finish(); + imo_decompress_thumbnails_done(); buck2obj_free(bob); mp_delete(pool); bclose(fb_cards); @@ -105,25 +113,162 @@ generate_signatures(uns limit) bclose(fb_signatures); } -UNUSED static void -generate_random_signatures(uns count) +/*********************************************************************************/ + +struct vectors_node { + oid_t oid; + u32 temp; + struct image_vector vec; +}; + +static uns vectors_count; +static struct vectors_node *vectors; + +static void +vectors_read(void) { - log(L_INFO, "Generating %d random signatures", count); - struct fastbuf *fb_signatures = index_bopen("image-sig", O_CREAT | O_WRONLY | O_TRUNC); - bputl(fb_signatures, count); - for (uns i = 0; i < count; i++) + log(L_DEBUG, "Reading signature vectors"); + struct fastbuf *fb = index_bopen("image-sig", O_RDONLY); + vectors_count = bgetl(fb); + if (vectors_count) { - oid_t oid = i; - struct image_vector vec; - for (uns j = 0; j < IMAGE_VEC_K; j++) - vec.f[j] = random_max(256); - bwrite(fb_signatures, &oid, sizeof(oid)); - bwrite(fb_signatures, &vec, sizeof(vec)); - bputc(fb_signatures, 0); + vectors = xmalloc(vectors_count * sizeof(struct vectors_node)); + for (uns i = 0; i < vectors_count; i++) + { + bread(fb, &vectors[i].oid, sizeof(oid_t)); + bread(fb, &vectors[i].vec, sizeof(struct image_vector)); + bskip(fb, bgetc(fb) * sizeof(struct image_region)); + } + } + bclose(fb); +} + +static void +vectors_cleanup(void) +{ + log(L_DEBUG, "Freeing signature vectors"); + if (vectors_count) + xfree(vectors); +} + +/*********************************************************************************/ + +static u64 random_clusters_max_size = 500000; +static uns random_clusters_max_count = 1000; + +#define RANDOM_CLUSTERS_SIZE 0x7fffffff +#define RANDOM_CLUSTERS_LAST 0x80000000 + +static struct random_clusters_node { + struct vectors_node *node; + s32 dot_prod; +} *random_clusters_temp; +static uns random_clusters_count; + +#define ASORT_PREFIX(x) random_clusters_##x +#define ASORT_KEY_TYPE s32 +#define ASORT_ELT(i) start[i].dot_prod +#define ASORT_SWAP(i,j) do { struct random_clusters_node _s = start[i]; start[i] = start[j]; start[j] = _s; } while(0) +#define ASORT_EXTRA_ARGS , struct random_clusters_node *start +#include "lib/arraysort.h" + +static void +random_clusters_init(void) +{ + if (!vectors_count) + return; + log(L_INFO, "Initializing random clusters generator"); + random_clusters_temp = xmalloc(vectors_count * sizeof(struct random_clusters_node)); + for (uns i = 0; i < vectors_count; i++) + random_clusters_temp[i].node = vectors + i; +} + +static void +random_clusters_build(void) +{ + random_clusters_count = 0; + if (!vectors_count) + return; + + log(L_INFO, "Generating random clusters for duplicates comparision"); + + for (uns i = 0; i < vectors_count; i++) + vectors[i].temp &= RANDOM_CLUSTERS_SIZE; + + /* Initialize recursion */ + struct stk { + uns count; + struct random_clusters_node *start; + } stk_top[64], *stk = stk_top + 1; + stk->start = random_clusters_temp; + stk->count = vectors_count; + + /* Main loop */ + while (stk != stk_top) + { + /* Split conditions */ + uns split; + if (stk->count < 2) + split = 0; + else if (stk->count > random_clusters_max_count) + split = 1; + else + { + s64 size = random_clusters_max_size; + for (uns i = 0; i < stk->count && size >= 0; i++) + size -= stk->start[i].node->temp; + split = size < 0; + } + + /* BSP leaf node */ + if (!split) + { + stk->start[stk->count - 1].node->temp |= RANDOM_CLUSTERS_LAST; + random_clusters_count++; + stk--; + } + + /* BSP internal node */ + else + { + /* Generate random normal vector of the splitting plane */ + int normal[IMAGE_VEC_K]; + for (uns i = 0; i < IMAGE_VEC_K; i++) + normal[i] = random_max(0x20001) - 0x10000; + + /* Compute dot produts */ + for (uns i = 0; i < stk->count; i++) + { + stk->start[i].dot_prod = 0; + for (uns j = 0; j < IMAGE_VEC_K; j++) + stk->start[i].dot_prod += normal[j] * stk->start[i].node->vec.f[j]; + } + + /* Sort... could be faster, because we only need the median */ + random_clusters_sort(stk->count, stk->start); + + /* Split in the middle */ + stk[1].count = stk[0].count >> 1; + stk[0].count -= stk[1].count; + stk[1].start = stk[0].start; + stk[0].start += stk[1].count; + stk++; + } } - bclose(fb_signatures); + log(L_INFO, "Generated %u clusters", random_clusters_count); +} + +static void +random_clusters_cleanup(void) +{ + if (vectors_count) + xfree(random_clusters_temp); } +/*********************************************************************************/ + +// FIXME: use vectors_read()... duplicate code + struct signature_record { oid_t oid; struct image_vector vec; @@ -142,13 +287,16 @@ struct signature_record { #define DBG_KD(x...) do{}while(0) #endif +static struct image_tree tree; +static struct signature_record *records; +static struct signature_record **precords; + static void -build_search_tree(void) +build_kd_tree(void) { log(L_INFO, "Building KD-tree"); struct fastbuf *fb_signatures = index_bopen("image-sig", O_RDONLY); - struct image_tree tree; tree.count = bgetl(fb_signatures); ASSERT(tree.count < 0x80000000); if (!tree.count) @@ -160,8 +308,8 @@ build_search_tree(void) else { DBG("Reading %d signatures", tree.count); - struct signature_record *records = xmalloc(tree.count * sizeof(struct signature_record)); - struct signature_record **precords = xmalloc(tree.count * sizeof(void *)); + records = xmalloc(tree.count * sizeof(struct signature_record)); + precords = xmalloc(tree.count * sizeof(void *)); for (uns i = 0; i < tree.count; i++) { bread(fb_signatures, &records[i].oid, sizeof(oid_t)); @@ -179,14 +327,14 @@ build_search_tree(void) tree.bbox.vec[0] = tree.bbox.vec[1] = records[0].vec; } bclose(fb_signatures); - + for (tree.depth = 1; (uns)(2 << tree.depth) < tree.count; tree.depth++); - DBG("depth=%d nodes=%d bbox=[(%s), (%s)]", tree.depth, 1 << tree.depth, + DBG("depth=%d nodes=%d bbox=[(%s), (%s)]", tree.depth, 1 << tree.depth, stk_print_image_vector(tree.bbox.vec + 0), stk_print_image_vector(tree.bbox.vec + 1)); uns leaves_index = 1 << (tree.depth - 1); tree.nodes = xmalloc_zero((1 << tree.depth) * sizeof(struct image_node)); tree.leaves = xmalloc_zero(tree.count * sizeof(struct image_leaf)); - + /* Initialize recursion */ struct stk { struct image_bbox bbox; @@ -200,7 +348,7 @@ build_search_tree(void) for (uns i = 0; i < IMAGE_VEC_K; i++) stk->bbox.vec[1].f[i] = tree.bbox.vec[1].f[i] - tree.bbox.vec[0].f[i]; uns entry_index = 0; - + /* Main loop */ while (stk != stk_top) { @@ -208,7 +356,7 @@ build_search_tree(void) stk - stk_top, stk->index, stk->count, stk->start - precords, stk_print_image_vector(stk->bbox.vec + 0), stk_print_image_vector(stk->bbox.vec + 1)); ASSERT(stk->count); - + /* Create leaf node */ if (stk->index >= leaves_index || stk->count < 2) { @@ -226,20 +374,20 @@ build_search_tree(void) if (stk->bbox.vec[1].f[i]) { uns value = - (record->vec.f[i] - stk->bbox.vec[0].f[i]) * + (record->vec.f[i] - stk->bbox.vec[0].f[i]) * ((1 << bits) - 1) / stk->bbox.vec[1].f[i]; ASSERT(value < (uns)(1 << bits)); leaf->flags |= value; } } if (!stk->count) - leaf->flags |= IMAGE_LEAF_LAST; - DBG_KD("Creating leaf node; oid=%d vec=(%s) flags=0x%08x", + leaf->flags |= IMAGE_LEAF_LAST; + DBG_KD("Creating leaf node; oid=%d vec=(%s) flags=0x%08x", leaf->oid, stk_print_image_vector(&record->vec), leaf->flags); } stk--; } - + /* Create internal node */ else { @@ -251,7 +399,7 @@ build_search_tree(void) /* Sort... FIXME: we only need the median */ build_search_tree_sort(stk->count, dim, stk->start); - + /* Split in the middle */ uns index = stk->index; stk[1].index = stk[0].index * 2; @@ -294,13 +442,433 @@ build_search_tree(void) bwrite(fb_tree, tree.leaves, tree.count * sizeof(struct image_leaf)); bclose(fb_tree); - xfree(tree.leaves); - xfree(tree.nodes); - xfree(precords); - xfree(records); + //xfree(tree.leaves); + //xfree(tree.nodes); + //xfree(precords); + //xfree(records); + } +} + +/*********************************************************************************/ + +#if 0 + +struct pass1_hilbert { + u32 index; + struct image_vector vec; +}; + +struct pass1_node { + cnode lru_node; + cnode buf_node; + uns buf_size; + byte *buf; + oid_t oid; + byte *url; + struct image image; + struct image_dup dup; +}; + +static uns pass1_buf_size = 400 << 20; +static uns pass1_max_count = 100000; +static uns pass1_search_dist = 40; +static uns pass1_search_count = 500; + +static struct mempool *pass1_pool; +static struct pass1_hilbert *pass1_hilbert_list; +static byte *pass1_buf_start; +static byte *pass1_buf_pos; +static uns pass1_buf_free; +static uns pass1_buf_used; +static clist pass1_buf_list; +static clist pass1_lru_list; +static u64 pass1_lookups; +static u64 pass1_reads; +static u64 pass1_pairs; +static u64 pass1_dups; +static u64 pass1_shrinks; +static u64 pass1_alloc_sum; + +#define HILBERT_PREFIX(x) pass1_hilbert_##x +#define HILBERT_TYPE byte +#define HILBERT_ORDER 8 +#define HILBERT_DIM IMAGE_VEC_K +#define HILBERT_WANT_ENCODE +#include "images/hilbert.h" + +#define ASORT_PREFIX(x) pass1_hilbert_sort_##x +#define ASORT_KEY_TYPE struct image_vector * +#define ASORT_ELT(i) (&pass1_hilbert_list[i].vec) +#define ASORT_LT(x,y) (memcmp(x, y, sizeof(*x)) < 0) +#define ASORT_SWAP(i,j) do { struct pass1_hilbert _s; \ + _s = pass1_hilbert_list[i]; \ + pass1_hilbert_list[i] = pass1_hilbert_list[j]; \ + pass1_hilbert_list[j] = _s; } while(0) +#include "lib/arraysort.h" + +static void +pass1_hilbert_sort(void) +{ + DBG("Computing positions on the Hilbert curve"); + pass1_hilbert_list = xmalloc(tree.count * sizeof(struct pass1_hilbert)); + for (uns i = 0; i < tree.count; i++) + { + struct pass1_hilbert *h = pass1_hilbert_list + i; + h->index = i; + byte vec[IMAGE_VEC_K]; + pass1_hilbert_encode(vec, precords[i]->vec.f); + for (uns j = 0; j < IMAGE_VEC_K; j++) + h->vec.f[j] = vec[IMAGE_VEC_K - 1 - j]; } + DBG("Sorting signatures in order of incresing parameters on the Hilbert curve"); + pass1_hilbert_sort_sort(tree.count); +#if 0 + for (uns i = 0; i < tree.count; i++) + { + if (i) + { + byte *v1 = precords[pass1_hilbert_list[i - 1].index]->vec.f; + byte *v2 = precords[pass1_hilbert_list[i].index]->vec.f; +#define SQR(x) ((x)*(x)) + uns dist = 0; + for (uns j = 0; j < 6; j++) + dist += SQR(v1[j] - v2[j]); + DBG("dist %d", dist); + } + DBG("index %d", pass1_hilbert_list[i].index); + } +#endif +} + +static void +pass1_hilbert_cleanup(void) +{ + xfree(pass1_hilbert_list); } +#define HASH_PREFIX(x) pass1_hash_##x +#define HASH_NODE struct pass1_node +#define HASH_KEY_ATOMIC oid +#define HASH_WANT_CLEANUP +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_WANT_REMOVE +#include "lib/hashtable.h" + +static inline void +pass1_buf_init(void) +{ + //DBG("pass1_buf_init()"); + pass1_buf_free = pass1_buf_size; + pass1_buf_start = pass1_buf_pos = xmalloc(pass1_buf_size); + pass1_buf_used = 0; +} + +static inline void +pass1_buf_cleanup(void) +{ + //DBG("pass1_buf_cleanup()"); + xfree(pass1_buf_start); +} + +static void +pass1_node_free(struct pass1_node *node) +{ + //DBG("pass1_node_free(%d)", (uns)node->oid); + if (node->buf_size) + { + pass1_buf_used -= node->buf_size; + clist_remove(&node->buf_node); + } + clist_remove(&node->lru_node); + pass1_hash_remove(node); +} + +static inline void +pass1_node_free_lru(void) +{ + ASSERT(!clist_empty(&pass1_lru_list)); + pass1_node_free(SKIP_BACK(struct pass1_node, lru_node, clist_head(&pass1_lru_list))); +} + +static inline void +pass1_node_after_move(struct pass1_node *node, addr_int_t move) +{ + //DBG("pass1_node_after_mode(%d, %d)", (uns)node->oid, (uns)move); + /* adjust internal pointers */ +#define MOVE(x) x = (byte *)(x) - move + MOVE(node->url); + MOVE(node->image.pixels); + MOVE(node->dup.buf); +#undef MOVE +} + +static inline void +pass1_buf_shrink(void) +{ + DBG("pass1_buf_shrink()"); + pass1_shrinks++; + pass1_buf_free = pass1_buf_size; + pass1_buf_pos = pass1_buf_start; + CLIST_FOR_EACH(void *, p, pass1_buf_list) + { + struct pass1_node *node = SKIP_BACK(struct pass1_node, buf_node, p); + if (node->buf != pass1_buf_pos) + { + memmove(pass1_buf_pos, node->buf, node->buf_size); + pass1_node_after_move(node, node->buf - pass1_buf_pos); + node->buf = pass1_buf_pos; + } + pass1_buf_pos += node->buf_size; + pass1_buf_free -= node->buf_size; + } +} + +static void * +pass1_buf_alloc(uns size) +{ + //DBG("pass1_buf_alloc(%d)", size); + + /* if there is not enough free space at the end of the buffer */ + if (size > pass1_buf_free) + { + /* free some lru nodes */ + //DBG("freeing lru nodes"); + while (size > pass1_buf_size - pass1_buf_used || pass1_buf_used > pass1_buf_size / 2) + { + if (unlikely(clist_empty(&pass1_lru_list))) // FIXME + die("Buffer too small"); + pass1_node_free_lru(); + } + + pass1_buf_shrink(); + } + + /* final allocation */ + void *result = pass1_buf_pos; + pass1_buf_pos += size; + pass1_buf_free -= size; + pass1_buf_used += size; + pass1_alloc_sum += size; + return result; +} + +static struct pass1_node * +pass1_node_new(oid_t oid) +{ + DBG("pass1_node_new(%d)", (uns)oid); + if (pass1_hash_table.hash_count == pass1_max_count) + pass1_node_free_lru(); + struct pass1_node *node = pass1_hash_new(oid); + mp_flush(pass1_pool); + pass1_reads++; + + /* read object */ + struct card_attr ca; + bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca)); /* FIXME: these seeks can be easily removed */ + bread(fb_card_attrs, &ca, sizeof(ca)); + + bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT); /* FIXME: maybe a presort should handle these random seeks */ + uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1); + uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN; + struct odes *obj = obj_read_bucket(buck2obj, pass1_pool, buck_type, buck_len, fb_cards, NULL); + if (unlikely(!obj)) + die("Failed to read card"); + byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U'); + uns url_len = strlen(url); + + /* decompress thumbnail */ + struct image_obj imo; + imo_init(&imo, pass1_pool, obj); + if (unlikely(!imo_decompress_thumbnail(&imo))) + die("Cannot decompress thumbnail"); + node->image = imo.thumb; + + /* create duplicates comparision object */ + image_dup_init(&node->dup, &node->image, pass1_pool); + + /* copy data */ + //DBG("loaded image %s s=%d d=%d", url, node->image.size, node->dup.buf_size); + node->buf_size = node->image.size + node->dup.buf_size + url_len + 1; + if (node->buf_size) + { + byte *buf = node->buf = pass1_buf_alloc(node->buf_size); + clist_add_tail(&pass1_buf_list, &node->buf_node); +#define COPY(ptr, size) ({ void *_p=buf; uns _size=(size); buf+=_size; memcpy(_p,(ptr),_size); _p; }) + node->url = COPY(url, url_len + 1); + node->image.pixels = COPY(node->image.pixels, node->image.size); + node->dup.buf = COPY(node->dup.buf, node->dup.buf_size); +#undef COPY + } + + /* add to lru list */ + return node; +} + +static inline struct pass1_node * +pass1_node_lock(oid_t oid) +{ + DBG("pass1_node_lock(%d)", (uns)oid); + pass1_lookups++; + struct pass1_node *node = pass1_hash_find(oid); + if (node) + { + clist_remove(&node->lru_node); + return node; + } + else + return pass1_node_new(oid); +} + +static inline void +pass1_node_unlock(struct pass1_node *node) +{ + //DBG("pass1_node_unlock(%d)", (uns)node->oid); + clist_add_tail(&pass1_lru_list, &node->lru_node); +} + +static void +pass1_show_stats(void) +{ + log(L_INFO, "%d count, %Ld lookups, %Ld reads, %Ld pairs, %Ld dups, %Ld shrinks", tree.count, + (long long int)pass1_lookups, (long long int)pass1_reads, + (long long int)pass1_pairs, (long long int)pass1_dups, (long long int)pass1_shrinks); +} + +static void +pass1(void) +{ + log(L_INFO, "Looking for duplicates"); + ASSERT(tree.nodes); + + /* initialization */ + pass1_lookups = pass1_reads = pass1_pairs = pass1_dups = pass1_shrinks = pass1_alloc_sum = 0; + fb_cards = bopen("index/cards", O_RDONLY, 10000); // FIXME + fb_card_attrs = bopen("index/card-attrs", O_RDONLY, sizeof(struct card_attr)); // FIXME + buck2obj = buck2obj_alloc(); + imo_decompress_thumbnails_init(); + clist_init(&pass1_lru_list); + clist_init(&pass1_buf_list); + pass1_hash_init(); + pass1_buf_init(); + pass1_pool = mp_new(1 << 20); + + /* Hilbert sort */ + pass1_hilbert_sort(); + pass1_hilbert_cleanup(); + + /* main loop */ + for (uns i = 0; i < tree.count; ) + { + /* lookup next image */ + oid_t oid = tree.leaves[i].oid; + struct pass1_node *node = pass1_node_lock(oid); + + /* compare with all near images */ + struct image_search search; + image_search_init(&search, &tree, &precords[i]->vec, pass1_search_dist); + /* FIXME: can be faster than general search in KD-tree */ + oid_t oid2; + uns dist; + for (uns j = 0; j < pass1_search_count && image_search_next(&search, &oid2, &dist); j++) + { + if (oid < oid2) + { + struct pass1_node *node2 = pass1_node_lock(oid2); + DBG("comparing %d and %d", oid, oid2); + if (image_dup_compare(&node->dup, &node2->dup, IMAGE_DUP_TRANS_ID)) + { + pass1_dups++; + log(L_DEBUG, "*** Found duplicates oid1=0x%x oid=0x%x", (uns)node->oid, (uns)node2->oid); + log(L_DEBUG, " %s", node->url); + log(L_DEBUG, " %s", node2->url); + } + pass1_pairs++; + pass1_node_unlock(node2); + } + } + image_search_done(&search); + pass1_node_unlock(node); + i++; + if (i % 1000 == 0) + log(L_DEBUG, "... passed %d images", i); + } + + /* clean up */ + pass1_hash_cleanup(); + pass1_buf_cleanup(); + mp_delete(pass1_pool); + bclose(fb_cards); + bclose(fb_card_attrs); + buck2obj_free(buck2obj); + imo_decompress_thumbnails_done(); + + /* print statistics */ + pass1_show_stats(); +} + +/*********************************************************************************/ + +static uns pass2_clusterings_count = 1; + +static void +pass2_estimate_sizes(void) +{ + if (!vectors_count) + return; + log(L_DEBUG, "Reading image sizes"); + + /* FIXME: hack, these reads are not necessary, can be done in previous phases */ + struct fastbuf *fb_cards = index_bopen("cards", O_RDONLY); + struct fastbuf *fb_card_attrs = index_bopen("card-attrs", O_RDONLY); + struct mempool *pool = mp_new(1 << 16); + struct buck2obj_buf *bob = buck2obj_alloc(); + + for (uns i = 0; i < vectors_count; i++) + { + oid_t oid = vectors[i].oid; + struct card_attr ca; + bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca)); + bread(fb_card_attrs, &ca, sizeof(ca)); + bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT); + uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1); + uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN; + mp_flush(pool); + struct odes *obj = obj_read_bucket(bob, pool, buck_type, buck_len, fb_cards, NULL); + byte *attr = obj_find_aval(obj, 'G'); + ASSERT(attr); + uns image_width, image_height, image_colors, thumb_width, thumb_height; + byte color_space[MAX_ATTR_SIZE]; + sscanf(attr, "%d%d%s%d%d%d", &image_width, &image_height, color_space, &image_colors, &thumb_width, &thumb_height); + vectors[i].temp = image_dup_estimate_size(thumb_width, thumb_height) + + sizeof(struct image) + thumb_width * thumb_height * 3; + } + buck2obj_free(bob); + mp_delete(pool); + bclose(fb_cards); + bclose(fb_card_attrs); +} + +static void +pass2(void) +{ + // FIXME: presorts, much allocated memory when not needed + vectors_read(); + pass2_estimate_sizes(); + random_clusters_init(); + for (uns clustering = 0; clustering < pass2_clusterings_count; clustering++) + { + random_clusters_build(); + // FIXME + // - external sort + // - generate and compare pairs in clusters + } + random_clusters_cleanup(); + vectors_cleanup(); +} +#endif + +/*********************************************************************************/ static char *shortopts = CF_SHORT_OPTS ""; static struct option longopts[] = @@ -326,12 +894,12 @@ usage(byte *msg) exit(1); } - + int main(int argc UNUSED, char **argv) { int opt; - + log_init(argv[0]); while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0) switch (opt) @@ -342,9 +910,35 @@ main(int argc UNUSED, char **argv) if (optind != argc) usage("Invalid usage"); - generate_signatures(~0U); - //generate_random_signatures(1000000); - build_search_tree(); - + srgb_to_luv_init(); + +#if 0 + while (1) + { + struct mempool *pool = mp_new(1024); + struct fastbuf *fb = bopen("a.jpg", O_RDONLY, 1024); + struct image_io io; + log(L_DEBUG, "opening"); + image_open(&io, fb, pool); + io.format = IMAGE_FORMAT_JPEG; + log(L_DEBUG, "reading"); + int i; + i = image_read(&io); + log(L_DEBUG, "done %d %d %d", i, io.image.width, io.image.height); + for (i = 0; i < 1000000000; i++) + ; + image_close(&io); + mp_delete(pool); + bclose(fb); + } +#endif + +#if 0 + generate_signatures(20000); + build_kd_tree(); + //pass1(); + pass2(); +#endif + return 0; }