1 // FIXME: this file is full of experiments... will be completely different in final version
5 #include "sherlock/sherlock.h"
6 #include "lib/mempool.h"
8 #include "lib/getopt.h"
9 #include "lib/fastbuf.h"
10 #include "lib/chartype.h"
11 #include "sherlock/object.h"
13 #include "lib/unicode.h"
14 #include "sherlock/lizard-fb.h"
15 #include "sherlock/tagged-text.h"
16 #include "charset/charconv.h"
17 #include "charset/unicat.h"
18 #include "charset/fb-charconv.h"
19 #include "indexer/indexer.h"
20 #include "indexer/lexicon.h"
21 #include "indexer/params.h"
22 #include "utils/dumpconfig.h"
23 #include "lang/lang.h"
24 #include "lib/base224.h"
26 #include "lib/clists.h"
28 #include "images/images.h"
29 #include "images/image-obj.h"
30 #include "images/image-sig.h"
31 #include "images/dup-cmp.h"
32 #include "images/kd-tree.h"
33 #include "images/color.h"
39 static struct fastbuf *fb_cards;
40 static struct fastbuf *fb_card_attrs;
41 static struct buck2obj_buf *buck2obj;
43 /* This should happen in gatherer or scanner */
/*
 * Reads the "card-attrs" index sequentially and, for every card whose
 * type_flags pass the range check below, loads the card object from "cards",
 * decompresses its thumbnail and computes an image signature that is
 * appended to the "image-sig" index file.
 *
 * Layout of "image-sig" as written here: a record count (bputl), then one
 * record per image made of the oid, the struct image_vector, a one-byte
 * region count (bputc) and that many struct image_region entries.
 *
 * limit is printed in one of the log messages below.
 * NOTE(review): whether it also bounds the loop cannot be seen in this
 * excerpt -- confirm against the full file.
 */
45 generate_signatures(uns limit)
47 fb_cards = index_bopen("cards", O_RDONLY);
48 fb_card_attrs = index_bopen("card-attrs", O_RDONLY);
49 struct fastbuf *fb_signatures = index_bopen("image-sig", O_CREAT | O_WRONLY | O_TRUNC);
51 struct image_signature sig;
52 struct mempool *pool = mp_new(1 << 16);
53 struct buck2obj_buf *bob = buck2obj_alloc();
57 log(L_INFO, "Generating image signatures");
59 log(L_INFO, "Generating at most %d image signatures", limit);
/* Placeholder for the record count; it is overwritten after the loop. */
60 bputl(fb_signatures, 0);
61 imo_decompress_thumbnails_init();
/* One struct card_attr per iteration; oid is the record's ordinal position. */
63 for (oid_t oid = 0; bread(fb_card_attrs, &ca, sizeof(ca)); oid++)
/* Unsigned wrap-around range check: true only when (type_flags >> 4) is 8..11. */
64 if ((uns)((ca.type_flags >> 4) - 8) < 4)
/* Seek to the card and read its bucket header (length, then type byte). */
66 bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT);
67 uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
68 uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
70 struct odes *obj = obj_read_bucket(bob, pool, buck_type, buck_len, fb_cards, NULL);
73 die("Failed to read card");
74 if (attr = obj_find_attr(obj, 'N'))
/*
 * The URL is the 'U' value of the ('U' + OBJ_ATTR_SON) sub-object.
 * NOTE(review): the obj_find_attr() result is dereferenced directly on this
 * line; confirm that a missing sub-object cannot occur here.
 */
77 byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U');
78 DBG("Reading oid=%d url=%s", oid, url);
81 imo_init(&imo, pool, obj);
82 if (imo_decompress_thumbnail(&imo))
84 if (compute_image_signature(&imo.thumb, &sig))
/* Emit one signature record: oid, vector, region count, regions. */
86 bwrite(fb_signatures, &oid, sizeof(oid));
87 bwrite(fb_signatures, &sig.vec, sizeof(struct image_vector));
88 bputc(fb_signatures, sig.len);
90 bwrite(fb_signatures, sig.reg, sig.len * sizeof(struct image_region));
/* Progress report every 10000 images. */
92 if (count % 10000 == 0)
93 log(L_DEBUG, "... passed %d images", count);
98 DBG("Cannot create signature");
101 DBG("Cannot decompress thumbnail");
/* Return to the start of the file and replace the placeholder with the real count. */
104 brewind(fb_signatures);
105 bputl(fb_signatures, count);
106 DBG("%d signatures written", count);
108 imo_decompress_thumbnails_done();
112 bclose(fb_card_attrs);
113 bclose(fb_signatures);
116 /*********************************************************************************/
118 struct vectors_node {
121 struct image_vector vec;
124 static uns vectors_count;
125 static struct vectors_node *vectors;
130 log(L_DEBUG, "Reading signature vectors");
131 struct fastbuf *fb = index_bopen("image-sig", O_RDONLY);
132 vectors_count = bgetl(fb);
135 vectors = xmalloc(vectors_count * sizeof(struct vectors_node));
136 for (uns i = 0; i < vectors_count; i++)
138 bread(fb, &vectors[i].oid, sizeof(oid_t));
139 bread(fb, &vectors[i].vec, sizeof(struct image_vector));
140 bskip(fb, bgetc(fb) * sizeof(struct image_region));
/*
 * Tear-down counterpart for the signature vector array.
 * NOTE(review): only the log call is visible in this excerpt; the actual
 * release of `vectors` is presumably on a line not shown -- confirm.
 */
147 vectors_cleanup(void)
149 log(L_DEBUG, "Freeing signature vectors");
154 /*********************************************************************************/
156 static u64 random_clusters_max_size = 500000;
157 static uns random_clusters_max_count = 1000;
159 #define RANDOM_CLUSTERS_SIZE 0x7fffffff
160 #define RANDOM_CLUSTERS_LAST 0x80000000
162 static struct random_clusters_node {
163 struct vectors_node *node;
165 } *random_clusters_temp;
166 static uns random_clusters_count;
168 #define ASORT_PREFIX(x) random_clusters_##x
169 #define ASORT_KEY_TYPE s32
170 #define ASORT_ELT(i) start[i].dot_prod
171 #define ASORT_SWAP(i,j) do { struct random_clusters_node _s = start[i]; start[i] = start[j]; start[j] = _s; } while(0)
172 #define ASORT_EXTRA_ARGS , struct random_clusters_node *start
173 #include "lib/arraysort.h"
/*
 * Allocates random_clusters_temp with one entry per loaded signature vector
 * and points entry i at vectors[i]. The dot_prod fields are not set here;
 * random_clusters_build() fills them in before each sort.
 */
176 random_clusters_init(void)
180 log(L_INFO, "Initializing random clusters generator");
181 random_clusters_temp = xmalloc(vectors_count * sizeof(struct random_clusters_node));
182 for (uns i = 0; i < vectors_count; i++)
183 random_clusters_temp[i].node = vectors + i;
/*
 * Partitions random_clusters_temp into clusters by repeatedly splitting a
 * range with a random hyperplane: each element is projected onto a random
 * normal vector, the range is sorted by that projection and cut in the
 * middle. The recursion is driven by an explicit stack instead of function
 * calls. A range that is not split further becomes a cluster; its final
 * element is tagged with RANDOM_CLUSTERS_LAST in node->temp and
 * random_clusters_count is incremented.
 *
 * NOTE(review): the exact split condition and the stack push/pop statements
 * are on lines not visible in this excerpt.
 */
187 random_clusters_build(void)
189 random_clusters_count = 0;
193 log(L_INFO, "Generating random clusters for duplicates comparision");
/* Keep only the size bits of temp, clearing any RANDOM_CLUSTERS_LAST marker. */
195 for (uns i = 0; i < vectors_count; i++)
196 vectors[i].temp &= RANDOM_CLUSTERS_SIZE;
198 /* Initialize recursion */
201 struct random_clusters_node *start;
/* Work starts at stk_top[1]; the main loop ends when stk is back at stk_top. */
202 } stk_top[64], *stk = stk_top + 1;
203 stk->start = random_clusters_temp;
204 stk->count = vectors_count;
207 while (stk != stk_top)
209 /* Split conditions */
213 else if (stk->count > random_clusters_max_count)
/*
 * Subtract the per-node sizes (node->temp) from the size budget; the loop
 * stops early as soon as the remaining budget goes negative.
 */
217 s64 size = random_clusters_max_size;
218 for (uns i = 0; i < stk->count && size >= 0; i++)
219 size -= stk->start[i].node->temp;
/* Tag the final element of this range as the end of a cluster. */
226 stk->start[stk->count - 1].node->temp |= RANDOM_CLUSTERS_LAST;
227 random_clusters_count++;
231 /* BSP internal node */
234 /* Generate random normal vector of the splitting plane */
235 int normal[IMAGE_VEC_K];
/* presumably components fall in [-0x10000, 0x10000] if random_max(n) yields 0..n-1 -- TODO confirm */
236 for (uns i = 0; i < IMAGE_VEC_K; i++)
237 normal[i] = random_max(0x20001) - 0x10000;
239 /* Compute dot products */
240 for (uns i = 0; i < stk->count; i++)
242 stk->start[i].dot_prod = 0;
243 for (uns j = 0; j < IMAGE_VEC_K; j++)
244 stk->start[i].dot_prod += normal[j] * stk->start[i].node->vec.f[j];
247 /* Sort... could be faster, because we only need the median */
248 random_clusters_sort(stk->count, stk->start);
250 /* Split in the middle */
/* stk[1] receives the first half of the sorted range; stk[0] keeps the rest. */
251 stk[1].count = stk[0].count >> 1;
252 stk[0].count -= stk[1].count;
253 stk[1].start = stk[0].start;
254 stk[0].start += stk[1].count;
258 log(L_INFO, "Generated %u clusters", random_clusters_count);
/* Releases the temporary node array allocated by random_clusters_init(). */
262 random_clusters_cleanup(void)
265 xfree(random_clusters_temp);
268 /*********************************************************************************/
270 // FIXME: use vectors_read()... duplicate code
272 struct signature_record {
274 struct image_vector vec;
277 #define ASORT_PREFIX(x) build_search_tree_##x
278 #define ASORT_KEY_TYPE struct signature_record *
279 #define ASORT_ELT(i) rec[i]
280 #define ASORT_LT(x,y) x->vec.f[dim] < y->vec.f[dim]
281 #define ASORT_EXTRA_ARGS , uns dim, struct signature_record **rec
282 #include "lib/arraysort.h"
285 #define DBG_KD(x...) DBG(x)
287 #define DBG_KD(x...) do{}while(0)
290 static struct image_tree tree;
291 static struct signature_record *records;
292 static struct signature_record **precords;
297 log(L_INFO, "Building KD-tree");
299 struct fastbuf *fb_signatures = index_bopen("image-sig", O_RDONLY);
300 tree.count = bgetl(fb_signatures);
301 ASSERT(tree.count < 0x80000000);
305 bclose(fb_signatures);
306 die("There are no signatures");
310 DBG("Reading %d signatures", tree.count);
311 records = xmalloc(tree.count * sizeof(struct signature_record));
312 precords = xmalloc(tree.count * sizeof(void *));
313 for (uns i = 0; i < tree.count; i++)
315 bread(fb_signatures, &records[i].oid, sizeof(oid_t));
316 bread(fb_signatures, &records[i].vec, sizeof(struct image_vector));
317 uns len = bgetc(fb_signatures);
318 bskip(fb_signatures, len * sizeof(struct image_region));
319 precords[i] = records + i;
321 for (uns j = 0; j < IMAGE_VEC_K; j++)
323 tree.bbox.vec[0].f[j] = MIN(tree.bbox.vec[0].f[j], records[i].vec.f[j]);
324 tree.bbox.vec[1].f[j] = MAX(tree.bbox.vec[1].f[j], records[i].vec.f[j]);
327 tree.bbox.vec[0] = tree.bbox.vec[1] = records[0].vec;
329 bclose(fb_signatures);
331 for (tree.depth = 1; (uns)(2 << tree.depth) < tree.count; tree.depth++);
332 DBG("depth=%d nodes=%d bbox=[(%s), (%s)]", tree.depth, 1 << tree.depth,
333 stk_print_image_vector(tree.bbox.vec + 0), stk_print_image_vector(tree.bbox.vec + 1));
334 uns leaves_index = 1 << (tree.depth - 1);
335 tree.nodes = xmalloc_zero((1 << tree.depth) * sizeof(struct image_node));
336 tree.leaves = xmalloc_zero(tree.count * sizeof(struct image_leaf));
338 /* Initialize recursion */
340 struct image_bbox bbox;
342 struct signature_record **start;
343 } stk_top[32], *stk = stk_top + 1;
345 stk->start = precords;
346 stk->count = tree.count;
347 stk->bbox.vec[0] = tree.bbox.vec[0];
348 for (uns i = 0; i < IMAGE_VEC_K; i++)
349 stk->bbox.vec[1].f[i] = tree.bbox.vec[1].f[i] - tree.bbox.vec[0].f[i];
353 while (stk != stk_top)
355 DBG_KD("Main loop... depth=%d index=%d count=%d, start=%d, min=%s dif=%s",
356 stk - stk_top, stk->index, stk->count, stk->start - precords,
357 stk_print_image_vector(stk->bbox.vec + 0), stk_print_image_vector(stk->bbox.vec + 1));
360 /* Create leaf node */
361 if (stk->index >= leaves_index || stk->count < 2)
363 tree.nodes[stk->index].val = IMAGE_NODE_LEAF | entry_index;
364 for (; stk->count--; stk->start++)
366 struct image_leaf *leaf = &tree.leaves[entry_index++];
367 struct signature_record *record = *stk->start;
368 leaf->oid = record->oid;
370 for (uns i = IMAGE_VEC_K; i--; )
372 uns bits = IMAGE_LEAF_BITS(i);
373 leaf->flags <<= bits;
374 if (stk->bbox.vec[1].f[i])
377 (record->vec.f[i] - stk->bbox.vec[0].f[i]) *
378 ((1 << bits) - 1) / stk->bbox.vec[1].f[i];
379 ASSERT(value < (uns)(1 << bits));
380 leaf->flags |= value;
384 leaf->flags |= IMAGE_LEAF_LAST;
385 DBG_KD("Creating leaf node; oid=%d vec=(%s) flags=0x%08x",
386 leaf->oid, stk_print_image_vector(&record->vec), leaf->flags);
391 /* Create internal node */
394 /* Select dimension to split */
396 for (uns i = 1; i < IMAGE_VEC_K; i++)
397 if (stk->bbox.vec[1].f[i] > stk->bbox.vec[1].f[dim])
400 /* Sort... FIXME: we only need the median */
401 build_search_tree_sort(stk->count, dim, stk->start);
403 /* Split in the middle */
404 uns index = stk->index;
405 stk[1].index = stk[0].index * 2;
406 stk[0].index = stk[1].index + 1;
407 stk[1].count = stk[0].count >> 1;
408 stk[0].count -= stk[1].count;
409 stk[1].start = stk[0].start;
410 stk[0].start += stk[1].count;
412 /* Choose split value */
413 uns lval = stk->start[-1]->vec.f[dim];
414 uns rval = stk->start[0]->vec.f[dim];
415 uns pivot = stk->bbox.vec[0].f[dim] + (stk->bbox.vec[1].f[dim] >> 1);
418 else if (pivot >= rval)
421 DBG_KD("Created internal node; dim=%d pivot=%d", dim, pivot);
424 stk[1].bbox = stk[0].bbox;
425 stk[1].bbox.vec[1].f[dim] = pivot - stk[0].bbox.vec[0].f[dim];
426 stk[0].bbox.vec[0].f[dim] += stk[1].bbox.vec[1].f[dim];
427 stk[0].bbox.vec[1].f[dim] -= stk[1].bbox.vec[1].f[dim];
429 /* Fill the node structure */
430 tree.nodes[index].val = dim + (pivot << 8);
435 DBG("Tree constructed, saving...");
437 struct fastbuf *fb_tree = index_bopen("image-tree", O_CREAT | O_WRONLY | O_TRUNC);
438 bputl(fb_tree, tree.count);
439 bputl(fb_tree, tree.depth);
440 bwrite(fb_tree, &tree.bbox, sizeof(struct image_bbox));
441 bwrite(fb_tree, tree.nodes + 1, ((1 << tree.depth) - 1) * sizeof(struct image_node));
442 bwrite(fb_tree, tree.leaves, tree.count * sizeof(struct image_leaf));
445 //xfree(tree.leaves);
452 /*********************************************************************************/
454 struct pass1_hilbert {
456 struct image_vector vec;
466 struct image_data image;
467 struct image_dup dup;
470 static uns pass1_buf_size = 400 << 20;
471 static uns pass1_max_count = 100000;
472 static uns pass1_search_dist = 40;
473 static uns pass1_search_count = 500;
475 static struct mempool *pass1_pool;
476 static struct pass1_hilbert *pass1_hilbert_list;
477 static byte *pass1_buf_start;
478 static byte *pass1_buf_pos;
479 static uns pass1_buf_free;
480 static uns pass1_buf_used;
481 static clist pass1_buf_list;
482 static clist pass1_lru_list;
483 static u64 pass1_lookups;
484 static u64 pass1_reads;
485 static u64 pass1_pairs;
486 static u64 pass1_dups;
487 static u64 pass1_shrinks;
488 static u64 pass1_alloc_sum;
490 #define HILBERT_PREFIX(x) pass1_hilbert_##x
491 #define HILBERT_TYPE byte
492 #define HILBERT_ORDER 8
493 #define HILBERT_DIM IMAGE_VEC_K
494 #define HILBERT_WANT_ENCODE
495 #include "images/hilbert.h"
497 #define ASORT_PREFIX(x) pass1_hilbert_sort_##x
498 #define ASORT_KEY_TYPE struct image_vector *
499 #define ASORT_ELT(i) (&pass1_hilbert_list[i].vec)
500 #define ASORT_LT(x,y) (memcmp(x, y, sizeof(*x)) < 0)
501 #define ASORT_SWAP(i,j) do { struct pass1_hilbert _s; \
502 _s = pass1_hilbert_list[i]; \
503 pass1_hilbert_list[i] = pass1_hilbert_list[j]; \
504 pass1_hilbert_list[j] = _s; } while(0)
505 #include "lib/arraysort.h"
/*
 * Builds pass1_hilbert_list with one entry per signature (tree.count
 * entries): each record's vector is run through pass1_hilbert_encode() and
 * the resulting bytes are stored in reverse order in the entry's vec. The
 * list is then sorted with pass1_hilbert_sort_sort(), whose comparison is
 * the memcmp()-based ASORT_LT defined above this function.
 *
 * The trailing loop only emits debug output about neighbouring entries.
 */
508 pass1_hilbert_sort(void)
510 DBG("Computing positions on the Hilbert curve");
511 pass1_hilbert_list = xmalloc(tree.count * sizeof(struct pass1_hilbert));
512 for (uns i = 0; i < tree.count; i++)
514 struct pass1_hilbert *h = pass1_hilbert_list + i;
516 byte vec[IMAGE_VEC_K];
517 pass1_hilbert_encode(vec, precords[i]->vec.f);
/* Store the encoded bytes back to front. */
518 for (uns j = 0; j < IMAGE_VEC_K; j++)
519 h->vec.f[j] = vec[IMAGE_VEC_K - 1 - j];
521 DBG("Sorting signatures in order of incresing parameters on the Hilbert curve");
522 pass1_hilbert_sort_sort(tree.count);
524 for (uns i = 0; i < tree.count; i++)
/*
 * Squared distance between the original vectors of entries i-1 and i.
 * NOTE(review): the guard that keeps i - 1 valid for i == 0 is not visible
 * in this excerpt -- confirm it exists.
 */
528 byte *v1 = precords[pass1_hilbert_list[i - 1].index]->vec.f;
529 byte *v2 = precords[pass1_hilbert_list[i].index]->vec.f;
530 #define SQR(x) ((x)*(x))
/* NOTE(review): the component count is hard-coded as 6 here rather than IMAGE_VEC_K. */
532 for (uns j = 0; j < 6; j++)
533 dist += SQR(v1[j] - v2[j]);
534 DBG("dist %d", dist);
536 DBG("index %d", pass1_hilbert_list[i].index);
/* Releases the list allocated by pass1_hilbert_sort(). */
542 pass1_hilbert_cleanup(void)
544 xfree(pass1_hilbert_list);
547 #define HASH_PREFIX(x) pass1_hash_##x
548 #define HASH_NODE struct pass1_node
549 #define HASH_KEY_ATOMIC oid
550 #define HASH_WANT_CLEANUP
551 #define HASH_WANT_FIND
552 #define HASH_WANT_NEW
553 #define HASH_WANT_REMOVE
554 #include "lib/hashtable.h"
559 //DBG("pass1_buf_init()");
560 pass1_buf_free = pass1_buf_size;
561 pass1_buf_start = pass1_buf_pos = xmalloc(pass1_buf_size);
/* Releases the image buffer; pass1_buf_start is the pointer obtained from xmalloc(pass1_buf_size). */
566 pass1_buf_cleanup(void)
568 //DBG("pass1_buf_cleanup()");
569 xfree(pass1_buf_start);
/*
 * Drops a cached image node: its bytes are subtracted from pass1_buf_used
 * and the node is unlinked from the buffer-order list, the LRU list and the
 * hash table.
 *
 * In the visible lines neither pass1_buf_pos nor pass1_buf_free is adjusted,
 * so the freed bytes become reusable only after pass1_buf_shrink() compacts
 * the buffer.
 */
573 pass1_node_free(struct pass1_node *node)
575 //DBG("pass1_node_free(%d)", (uns)node->oid);
578 pass1_buf_used -= node->buf_size;
579 clist_remove(&node->buf_node);
581 clist_remove(&node->lru_node);
582 pass1_hash_remove(node);
/*
 * Evicts the node at the head of the LRU list. pass1_node_unlock() appends
 * to the tail, so the head is the node that was unlocked longest ago.
 * The list must not be empty (asserted).
 */
586 pass1_node_free_lru(void)
588 ASSERT(!clist_empty(&pass1_lru_list));
589 pass1_node_free(SKIP_BACK(struct pass1_node, lru_node, clist_head(&pass1_lru_list)));
/*
 * Fixes up pointers stored inside a node after its buffer contents were
 * moved `move` bytes towards lower addresses (the old address minus the new
 * one, as passed by pass1_buf_shrink()).
 * NOTE(review): only image.pixels is adjusted in the visible lines; other
 * MOVE() lines may exist in the gaps of this excerpt.
 */
593 pass1_node_after_move(struct pass1_node *node, addr_int_t move)
595 //DBG("pass1_node_after_mode(%d, %d)", (uns)node->oid, (uns)move);
596 /* adjust internal pointers */
597 #define MOVE(x) x = (byte *)(x) - move
599 MOVE(node->image.pixels);
/*
 * Compacts the image buffer: walks the nodes in pass1_buf_list order and
 * slides each node's data down so that all live data is contiguous from
 * pass1_buf_start. On return pass1_buf_pos points just past the last node
 * and pass1_buf_free is the space remaining after it.
 */
605 pass1_buf_shrink(void)
607 DBG("pass1_buf_shrink()");
/* Start over from an empty buffer and re-add each node's size below. */
609 pass1_buf_free = pass1_buf_size;
610 pass1_buf_pos = pass1_buf_start;
611 CLIST_FOR_EACH(void *, p, pass1_buf_list)
613 struct pass1_node *node = SKIP_BACK(struct pass1_node, buf_node, p);
/* Nodes already sitting at the write position need no move. */
614 if (node->buf != pass1_buf_pos)
/* memmove() because source and destination ranges may overlap. */
616 memmove(pass1_buf_pos, node->buf, node->buf_size);
617 pass1_node_after_move(node, node->buf - pass1_buf_pos);
618 node->buf = pass1_buf_pos;
620 pass1_buf_pos += node->buf_size;
621 pass1_buf_free -= node->buf_size;
/*
 * Reserves `size` bytes at the current end of the image buffer (bump
 * allocation from pass1_buf_pos) and updates the free/used/alloc_sum
 * counters.
 *
 * When the tail of the buffer is too small, LRU nodes are evicted until the
 * request fits into the unused total and at most half of the buffer is in
 * use; die() is called if there is nothing left to evict.
 * NOTE(review): the compaction step that turns the evicted space into
 * contiguous free space, and the return of `result`, are presumably on lines
 * not shown in this excerpt -- confirm.
 */
626 pass1_buf_alloc(uns size)
628 //DBG("pass1_buf_alloc(%d)", size);
630 /* if there is not enough free space at the end of the buffer */
631 if (size > pass1_buf_free)
633 /* free some lru nodes */
634 //DBG("freeing lru nodes");
635 while (size > pass1_buf_size - pass1_buf_used || pass1_buf_used > pass1_buf_size / 2)
637 if (unlikely(clist_empty(&pass1_lru_list))) // FIXME
638 die("Buffer too small");
639 pass1_node_free_lru();
645 /* final allocation */
646 void *result = pass1_buf_pos;
647 pass1_buf_pos += size;
648 pass1_buf_free -= size;
649 pass1_buf_used += size;
650 pass1_alloc_sum += size;
/*
 * Loads the image with the given oid into the cache and returns its node.
 *
 * Steps visible here: evict one LRU node if the hash table already holds
 * pass1_max_count entries, create the hash entry, read the card attributes
 * and the card object, decompress the thumbnail, initialise the duplicate
 * comparison data, and finally copy the URL, the pixels and the comparison
 * buffer into one contiguous chunk obtained from pass1_buf_alloc().
 *
 * Calls die() when the card cannot be read or the thumbnail cannot be
 * decompressed.
 */
654 static struct pass1_node *
655 pass1_node_new(oid_t oid)
657 DBG("pass1_node_new(%d)", (uns)oid);
658 if (pass1_hash_table.hash_count == pass1_max_count)
659 pass1_node_free_lru();
660 struct pass1_node *node = pass1_hash_new(oid);
/* Reuse the temporary pool; everything that must survive is copied out below. */
661 mp_flush(pass1_pool);
666 bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca)); /* FIXME: these seeks can be easily removed */
667 bread(fb_card_attrs, &ca, sizeof(ca));
669 bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT); /* FIXME: maybe a presort should handle these random seeks */
670 uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
671 uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
672 struct odes *obj = obj_read_bucket(buck2obj, pass1_pool, buck_type, buck_len, fb_cards, NULL);
674 die("Failed to read card");
/* NOTE(review): the obj_find_attr() result is dereferenced directly on this line; confirm it cannot be NULL. */
675 byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U');
676 uns url_len = strlen(url);
678 /* decompress thumbnail */
679 struct image_obj imo;
680 imo_init(&imo, pass1_pool, obj);
681 if (unlikely(!imo_decompress_thumbnail(&imo)))
682 die("Cannot decompress thumbnail");
683 node->image = imo.thumb;
685 /* create duplicates comparison object */
686 image_dup_init(&node->dup, &node->image, pass1_pool);
689 //DBG("loaded image %s s=%d d=%d", url, node->image.size, node->dup.buf_size);
/* Total bytes needed: pixels + comparison buffer + URL including its terminating NUL. */
690 node->buf_size = node->image.size + node->dup.buf_size + url_len + 1;
693 byte *buf = node->buf = pass1_buf_alloc(node->buf_size);
694 clist_add_tail(&pass1_buf_list, &node->buf_node);
/* COPY() copies `size` bytes to the running position `buf`, advances it and yields the copy's address. */
695 #define COPY(ptr, size) ({ void *_p=buf; uns _size=(size); buf+=_size; memcpy(_p,(ptr),_size); _p; })
696 node->url = COPY(url, url_len + 1);
697 node->image.pixels = COPY(node->image.pixels, node->image.size);
698 node->dup.buf = COPY(node->dup.buf, node->dup.buf_size);
702 /* add to lru list */
/*
 * Returns the cached node for `oid`, loading it with pass1_node_new() when
 * the hash lookup does not yield one. A node found in the cache is removed
 * from the LRU list, so pass1_node_free_lru() cannot pick it until
 * pass1_node_unlock() puts it back.
 * NOTE(review): the branch conditions and the return for the cache-hit path
 * are on lines not visible in this excerpt.
 */
706 static inline struct pass1_node *
707 pass1_node_lock(oid_t oid)
709 DBG("pass1_node_lock(%d)", (uns)oid);
711 struct pass1_node *node = pass1_hash_find(oid);
714 clist_remove(&node->lru_node);
718 return pass1_node_new(oid);
/* Releases a locked node by appending it to the tail of the LRU list, making it evictable again. */
722 pass1_node_unlock(struct pass1_node *node)
724 //DBG("pass1_node_unlock(%d)", (uns)node->oid);
725 clist_add_tail(&pass1_lru_list, &node->lru_node);
/*
 * Logs the pass 1 counters (signature count, lookups, reads, pairs, dups,
 * shrinks) at L_INFO. The u64 counters are cast to long long to match %Ld.
 */
729 pass1_show_stats(void)
731 log(L_INFO, "%d count, %Ld lookups, %Ld reads, %Ld pairs, %Ld dups, %Ld shrinks", tree.count,
732 (long long int)pass1_lookups, (long long int)pass1_reads,
733 (long long int)pass1_pairs, (long long int)pass1_dups, (long long int)pass1_shrinks);
739 log(L_INFO, "Looking for duplicates");
743 pass1_lookups = pass1_reads = pass1_pairs = pass1_dups = pass1_shrinks = pass1_alloc_sum = 0;
744 fb_cards = bopen("index/cards", O_RDONLY, 10000); // FIXME
745 fb_card_attrs = bopen("index/card-attrs", O_RDONLY, sizeof(struct card_attr)); // FIXME
746 buck2obj = buck2obj_alloc();
747 imo_decompress_thumbnails_init();
748 clist_init(&pass1_lru_list);
749 clist_init(&pass1_buf_list);
752 pass1_pool = mp_new(1 << 20);
755 pass1_hilbert_sort();
756 pass1_hilbert_cleanup();
759 for (uns i = 0; i < tree.count; )
761 /* lookup next image */
762 oid_t oid = tree.leaves[i].oid;
763 struct pass1_node *node = pass1_node_lock(oid);
765 /* compare with all near images */
766 struct image_search search;
767 image_search_init(&search, &tree, &precords[i]->vec, pass1_search_dist);
768 /* FIXME: can be faster than general search in KD-tree */
771 for (uns j = 0; j < pass1_search_count && image_search_next(&search, &oid2, &dist); j++)
775 struct pass1_node *node2 = pass1_node_lock(oid2);
776 DBG("comparing %d and %d", oid, oid2);
777 if (image_dup_compare(&node->dup, &node2->dup, IMAGE_DUP_TRANS_ID))
780 log(L_DEBUG, "*** Found duplicates oid1=0x%x oid=0x%x", (uns)node->oid, (uns)node2->oid);
781 log(L_DEBUG, " %s", node->url);
782 log(L_DEBUG, " %s", node2->url);
785 pass1_node_unlock(node2);
788 image_search_done(&search);
789 pass1_node_unlock(node);
792 log(L_DEBUG, "... passed %d images", i);
796 pass1_hash_cleanup();
798 mp_delete(pass1_pool);
800 bclose(fb_card_attrs);
801 buck2obj_free(buck2obj);
802 imo_decompress_thumbnails_done();
804 /* print statistics */
808 /*********************************************************************************/
810 static uns pass2_clusterings_count = 1;
/*
 * For every loaded signature vector, reads the corresponding card, parses
 * the thumbnail dimensions from its 'G' attribute and stores an estimated
 * in-memory size in vectors[i].temp:
 *   image_dup_estimate_size(w, h) + sizeof(struct image_data) + w * h * 3
 * random_clusters_build() later sums these values against its size budget.
 */
813 pass2_estimate_sizes(void)
817 log(L_DEBUG, "Reading image sizes");
819 /* FIXME: hack, these reads are not necessary, can be done in previous phases */
/* These locals shadow the file-scope fb_cards / fb_card_attrs. */
820 struct fastbuf *fb_cards = index_bopen("cards", O_RDONLY);
821 struct fastbuf *fb_card_attrs = index_bopen("card-attrs", O_RDONLY);
822 struct mempool *pool = mp_new(1 << 16);
823 struct buck2obj_buf *bob = buck2obj_alloc();
825 for (uns i = 0; i < vectors_count; i++)
827 oid_t oid = vectors[i].oid;
/* Card attributes are fixed-size records indexed by oid. */
829 bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca));
830 bread(fb_card_attrs, &ca, sizeof(ca));
831 bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT);
832 uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
833 uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
835 struct odes *obj = obj_read_bucket(bob, pool, buck_type, buck_len, fb_cards, NULL);
836 byte *attr = obj_find_aval(obj, 'G');
838 uns image_width, image_height, image_colors, thumb_width, thumb_height;
839 byte color_space[MAX_ATTR_SIZE];
/*
 * NOTE(review): the sscanf() result is not checked on this line, %d is used
 * with `uns` destinations (%u would match), and %s has no field width.
 * Confirm that obj and attr are validated on lines not shown here.
 */
840 sscanf(attr, "%d%d%s%d%d%d", &image_width, &image_height, color_space, &image_colors, &thumb_width, &thumb_height);
841 vectors[i].temp = image_dup_estimate_size(thumb_width, thumb_height) +
842 sizeof(struct image_data) + thumb_width * thumb_height * 3;
847 bclose(fb_card_attrs);
853 // FIXME: presorts, much allocated memory when not needed
855 pass2_estimate_sizes();
856 random_clusters_init();
857 for (uns clustering = 0; clustering < pass2_clusterings_count; clustering++)
859 random_clusters_build();
862 // - generate and compare pairs in clusters
864 random_clusters_cleanup();
868 /*********************************************************************************/
870 static char *shortopts = CF_SHORT_OPTS "";
871 static struct option longopts[] =
877 static char *help = "\
878 Usage: image-indexer [<options>]\n\
880 Options:\n" CF_USAGE;
896 main(int argc UNUSED, char **argv)
901 while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
905 usage("Invalid option");
908 usage("Invalid usage");
912 generate_signatures(20000);