3 #include "sherlock/sherlock.h"
4 #include "lib/mempool.h"
6 #include "lib/fastbuf.h"
7 #include "lib/chartype.h"
8 #include "sherlock/object.h"
10 #include "lib/unicode.h"
11 #include "sherlock/lizard-fb.h"
12 #include "sherlock/tagged-text.h"
13 #include "charset/charconv.h"
14 #include "charset/unicat.h"
15 #include "charset/fb-charconv.h"
16 #include "indexer/indexer.h"
17 #include "indexer/lexicon.h"
18 #include "indexer/params.h"
19 #include "utils/dumpconfig.h"
20 #include "lang/lang.h"
21 #include "lib/base224.h"
23 #include "lib/clists.h"
25 #include "images/images.h"
26 #include "images/image-obj.h"
27 #include "images/image-sig.h"
28 #include "images/dup-cmp.h"
29 #include "images/kd-tree.h"
30 #include "images/color.h"
36 static struct fastbuf *fb_cards;
37 static struct fastbuf *fb_card_attrs;
38 static struct buck2obj_buf *buck2obj;
40 /* This should happen in gatherer or scanner */
42 generate_signatures(uns limit)
44 fb_cards = index_bopen("cards", O_RDONLY);
45 fb_card_attrs = index_bopen("card-attrs", O_RDONLY);
46 struct fastbuf *fb_signatures = index_bopen("image-sig", O_CREAT | O_WRONLY | O_TRUNC);
48 struct image_signature sig;
49 struct mempool *pool = mp_new(1 << 16);
50 struct buck2obj_buf *bob = buck2obj_alloc();
54 log(L_INFO, "Generating image signatures");
56 log(L_INFO, "Generating at most %d image signatures", limit);
57 bputl(fb_signatures, 0);
58 imo_decompress_thumbnails_init();
60 for (oid_t oid = 0; bread(fb_card_attrs, &ca, sizeof(ca)); oid++)
61 if ((uns)((ca.type_flags >> 4) - 8) < 4)
63 bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT);
64 uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
65 uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
67 struct odes *obj = obj_read_bucket(bob, pool, buck_type, buck_len, fb_cards, NULL);
70 die("Failed to read card");
71 if (attr = obj_find_attr(obj, 'N'))
74 byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U');
75 DBG("Reading oid=%d url=%s", oid, url);
78 imo_init(&imo, pool, obj);
79 if (imo_decompress_thumbnail(&imo))
81 if (compute_image_signature(&imo.thumb, &sig))
83 bwrite(fb_signatures, &oid, sizeof(oid));
84 bwrite(fb_signatures, &sig.vec, sizeof(struct image_vector));
85 bputc(fb_signatures, sig.len);
87 bwrite(fb_signatures, sig.reg, sig.len * sizeof(struct image_region));
89 if (count % 10000 == 0)
90 log(L_DEBUG, "... passed %d images", count);
95 DBG("Cannot create signature");
98 DBG("Cannot decompress thumbnail");
101 brewind(fb_signatures);
102 bputl(fb_signatures, count);
103 DBG("%d signatures written", count);
105 imo_decompress_thumbnails_done();
109 bclose(fb_card_attrs);
110 bclose(fb_signatures);
113 struct signature_record {
115 struct image_vector vec;
118 #define ASORT_PREFIX(x) build_search_tree_##x
119 #define ASORT_KEY_TYPE struct signature_record *
120 #define ASORT_ELT(i) rec[i]
121 #define ASORT_LT(x,y) x->vec.f[dim] < y->vec.f[dim]
122 #define ASORT_EXTRA_ARGS , uns dim, struct signature_record **rec
123 #include "lib/arraysort.h"
126 #define DBG_KD(x...) DBG(x)
128 #define DBG_KD(x...) do{}while(0)
131 static struct image_tree tree;
132 static struct signature_record *records;
133 static struct signature_record **precords;
138 log(L_INFO, "Building KD-tree");
140 struct fastbuf *fb_signatures = index_bopen("image-sig", O_RDONLY);
141 tree.count = bgetl(fb_signatures);
142 ASSERT(tree.count < 0x80000000);
146 bclose(fb_signatures);
147 die("There are no signatures");
151 DBG("Reading %d signatures", tree.count);
152 records = xmalloc(tree.count * sizeof(struct signature_record));
153 precords = xmalloc(tree.count * sizeof(void *));
154 for (uns i = 0; i < tree.count; i++)
156 bread(fb_signatures, &records[i].oid, sizeof(oid_t));
157 bread(fb_signatures, &records[i].vec, sizeof(struct image_vector));
158 uns len = bgetc(fb_signatures);
159 bskip(fb_signatures, len * sizeof(struct image_region));
160 precords[i] = records + i;
162 for (uns j = 0; j < IMAGE_VEC_K; j++)
164 tree.bbox.vec[0].f[j] = MIN(tree.bbox.vec[0].f[j], records[i].vec.f[j]);
165 tree.bbox.vec[1].f[j] = MAX(tree.bbox.vec[1].f[j], records[i].vec.f[j]);
168 tree.bbox.vec[0] = tree.bbox.vec[1] = records[0].vec;
170 bclose(fb_signatures);
172 for (tree.depth = 1; (uns)(2 << tree.depth) < tree.count; tree.depth++);
173 DBG("depth=%d nodes=%d bbox=[(%s), (%s)]", tree.depth, 1 << tree.depth,
174 stk_print_image_vector(tree.bbox.vec + 0), stk_print_image_vector(tree.bbox.vec + 1));
175 uns leaves_index = 1 << (tree.depth - 1);
176 tree.nodes = xmalloc_zero((1 << tree.depth) * sizeof(struct image_node));
177 tree.leaves = xmalloc_zero(tree.count * sizeof(struct image_leaf));
179 /* Initialize recursion */
181 struct image_bbox bbox;
183 struct signature_record **start;
184 } stk_top[32], *stk = stk_top + 1;
186 stk->start = precords;
187 stk->count = tree.count;
188 stk->bbox.vec[0] = tree.bbox.vec[0];
189 for (uns i = 0; i < IMAGE_VEC_K; i++)
190 stk->bbox.vec[1].f[i] = tree.bbox.vec[1].f[i] - tree.bbox.vec[0].f[i];
194 while (stk != stk_top)
196 DBG_KD("Main loop... depth=%d index=%d count=%d, start=%d, min=%s dif=%s",
197 stk - stk_top, stk->index, stk->count, stk->start - precords,
198 stk_print_image_vector(stk->bbox.vec + 0), stk_print_image_vector(stk->bbox.vec + 1));
201 /* Create leaf node */
202 if (stk->index >= leaves_index || stk->count < 2)
204 tree.nodes[stk->index].val = IMAGE_NODE_LEAF | entry_index;
205 for (; stk->count--; stk->start++)
207 struct image_leaf *leaf = &tree.leaves[entry_index++];
208 struct signature_record *record = *stk->start;
209 leaf->oid = record->oid;
211 for (uns i = IMAGE_VEC_K; i--; )
213 uns bits = IMAGE_LEAF_BITS(i);
214 leaf->flags <<= bits;
215 if (stk->bbox.vec[1].f[i])
218 (record->vec.f[i] - stk->bbox.vec[0].f[i]) *
219 ((1 << bits) - 1) / stk->bbox.vec[1].f[i];
220 ASSERT(value < (uns)(1 << bits));
221 leaf->flags |= value;
225 leaf->flags |= IMAGE_LEAF_LAST;
226 DBG_KD("Creating leaf node; oid=%d vec=(%s) flags=0x%08x",
227 leaf->oid, stk_print_image_vector(&record->vec), leaf->flags);
232 /* Create internal node */
235 /* Select dimension to splis */
237 for (uns i = 1; i < IMAGE_VEC_K; i++)
238 if (stk->bbox.vec[1].f[i] > stk->bbox.vec[1].f[dim])
241 /* Sort... FIXME: we only need the median */
242 build_search_tree_sort(stk->count, dim, stk->start);
244 /* Split in the middle */
245 uns index = stk->index;
246 stk[1].index = stk[0].index * 2;
247 stk[0].index = stk[1].index + 1;
248 stk[1].count = stk[0].count >> 1;
249 stk[0].count -= stk[1].count;
250 stk[1].start = stk[0].start;
251 stk[0].start += stk[1].count;
253 /* Choose split value */
254 uns lval = stk->start[-1]->vec.f[dim];
255 uns rval = stk->start[0]->vec.f[dim];
256 uns pivot = stk->bbox.vec[0].f[dim] + (stk->bbox.vec[1].f[dim] >> 1);
259 else if (pivot >= rval)
262 DBG_KD("Created internal node; dim=%d pivot=%d", dim, pivot);
265 stk[1].bbox = stk[0].bbox;
266 stk[1].bbox.vec[1].f[dim] = pivot - stk[0].bbox.vec[0].f[dim];
267 stk[0].bbox.vec[0].f[dim] += stk[1].bbox.vec[1].f[dim];
268 stk[0].bbox.vec[1].f[dim] -= stk[1].bbox.vec[1].f[dim];
270 /* Fill the node structure */
271 tree.nodes[index].val = dim + (pivot << 8);
276 DBG("Tree constructed, saving...");
278 struct fastbuf *fb_tree = index_bopen("image-tree", O_CREAT | O_WRONLY | O_TRUNC);
279 bputl(fb_tree, tree.count);
280 bputl(fb_tree, tree.depth);
281 bwrite(fb_tree, &tree.bbox, sizeof(struct image_bbox));
282 bwrite(fb_tree, tree.nodes + 1, ((1 << tree.depth) - 1) * sizeof(struct image_node));
283 bwrite(fb_tree, tree.leaves, tree.count * sizeof(struct image_leaf));
286 //xfree(tree.leaves);
293 /*********************************************************************************/
295 static uns pass1_buf_size = 400 << 20;
296 static uns pass1_max_count = 100000;
297 static uns pass1_search_dist = 40;
298 static uns pass1_search_count = 500;
300 static struct mempool *pass1_pool;
301 static byte *pass1_buf_start;
302 static byte *pass1_buf_pos;
303 static uns pass1_buf_free;
304 static uns pass1_buf_used;
305 static clist pass1_buf_list;
306 static clist pass1_lru_list;
307 static u64 pass1_lookups;
308 static u64 pass1_reads;
309 static u64 pass1_pairs;
310 static u64 pass1_dups;
311 static u64 pass1_shrinks;
321 struct image_dup dup;
324 #define HASH_PREFIX(x) pass1_hash_##x
325 #define HASH_NODE struct pass1_node
326 #define HASH_KEY_ATOMIC oid
327 #define HASH_WANT_CLEANUP
328 #define HASH_WANT_FIND
329 #define HASH_WANT_NEW
330 #define HASH_WANT_REMOVE
331 #include "lib/hashtable.h"
336 //DBG("pass1_buf_init()");
337 pass1_buf_free = pass1_buf_size;
338 pass1_buf_start = pass1_buf_pos = xmalloc(pass1_buf_size);
343 pass1_buf_cleanup(void)
345 //DBG("pass1_buf_cleanup()");
346 xfree(pass1_buf_start);
350 pass1_node_free(struct pass1_node *node)
352 //DBG("pass1_node_free(%d)", (uns)node->oid);
355 pass1_buf_used -= node->buf_size;
356 clist_remove(&node->buf_node);
358 clist_remove(&node->lru_node);
359 pass1_hash_remove(node);
363 pass1_node_free_lru(void)
365 ASSERT(!clist_empty(&pass1_lru_list));
366 pass1_node_free(SKIP_BACK(struct pass1_node, lru_node, clist_head(&pass1_lru_list)));
370 pass1_node_after_move(struct pass1_node *node, addr_int_t move)
372 //DBG("pass1_node_after_mode(%d, %d)", (uns)node->oid, (uns)move);
373 /* adjust internal pointers */
374 #define MOVE(x) x = (byte *)(x) - move
376 MOVE(node->image.pixels);
382 pass1_buf_shrink(void)
384 DBG("pass1_buf_shrink()");
386 pass1_buf_free = pass1_buf_size;
387 pass1_buf_pos = pass1_buf_start;
388 CLIST_FOR_EACH(void *, p, pass1_buf_list)
390 struct pass1_node *node = SKIP_BACK(struct pass1_node, buf_node, p);
391 if (node->buf != pass1_buf_pos)
393 memmove(pass1_buf_pos, node->buf, node->buf_size);
394 pass1_node_after_move(node, node->buf - pass1_buf_pos);
395 node->buf = pass1_buf_pos;
397 pass1_buf_pos += node->buf_size;
398 pass1_buf_free -= node->buf_size;
403 pass1_buf_alloc(uns size)
405 //DBG("pass1_buf_alloc(%d)", size);
407 /* if there is not enough free space at the end of the buffer */
408 if (size > pass1_buf_free)
410 /* free some lru nodes */
411 //DBG("freeing lru nodes");
412 while (size > pass1_buf_size - pass1_buf_used || pass1_buf_used * 2 > pass1_buf_size)
414 if (unlikely(clist_empty(&pass1_lru_list))) // FIXME
415 die("Buffer too small");
416 pass1_node_free_lru();
422 /* final allocation */
423 void *result = pass1_buf_pos;
424 pass1_buf_pos += size;
425 pass1_buf_free -= size;
426 pass1_buf_used += size;
430 static struct pass1_node *
431 pass1_node_new(oid_t oid)
433 DBG("pass1_node_new(%d)", (uns)oid);
434 if (pass1_hash_table.hash_count == pass1_max_count)
435 pass1_node_free_lru();
436 struct pass1_node *node = pass1_hash_new(oid);
437 mp_flush(pass1_pool);
442 bsetpos(fb_card_attrs, (sh_off_t)oid * sizeof(ca)); /* FIXME: these seeks can be easily removed */
443 bread(fb_card_attrs, &ca, sizeof(ca));
445 bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT); /* FIXME: maybe a presort should handle these random seeks */
446 uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
447 uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
448 struct odes *obj = obj_read_bucket(buck2obj, pass1_pool, buck_type, buck_len, fb_cards, NULL);
450 die("Failed to read card");
451 byte *url = obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U');
452 uns url_len = strlen(url);
454 /* decompress thumbnail */
455 struct image_obj imo;
456 imo_init(&imo, pass1_pool, obj);
457 if (unlikely(!imo_decompress_thumbnail(&imo)))
458 die("Cannot decompress thumbnail");
459 node->image = imo.thumb;
461 /* create duplicates comparision object */
462 image_dup_init(&node->dup, &node->image, pass1_pool);
465 //DBG("loaded image %s s=%d d=%d", url, node->image.size, node->dup.buf_size);
466 node->buf_size = node->image.size + node->dup.buf_size + url_len + 1;
469 byte *buf = node->buf = pass1_buf_alloc(node->buf_size);
470 clist_add_tail(&pass1_buf_list, &node->buf_node);
471 #define COPY(ptr, size) ({ void *_p=buf; uns _size=(size); buf+=_size; memcpy(_p,(ptr),_size); _p; })
472 node->url = COPY(url, url_len + 1);
473 node->image.pixels = COPY(node->image.pixels, node->image.size);
474 node->dup.buf = COPY(node->dup.buf, node->dup.buf_size);
478 /* add to lru list */
482 static inline struct pass1_node *
483 pass1_node_lock(oid_t oid)
485 DBG("pass1_node_lock(%d)", (uns)oid);
487 struct pass1_node *node = pass1_hash_find(oid);
490 clist_remove(&node->lru_node);
494 return pass1_node_new(oid);
498 pass1_node_unlock(struct pass1_node *node)
500 //DBG("pass1_node_unlock(%d)", (uns)node->oid);
501 clist_add_tail(&pass1_lru_list, &node->lru_node);
507 log(L_INFO, "Looking for duplicates");
511 pass1_lookups = pass1_reads = pass1_pairs = pass1_dups = pass1_shrinks = 0;
512 fb_cards = bopen("index/cards", O_RDONLY, 10000); // FIXME
513 fb_card_attrs = bopen("index/card-attrs", O_RDONLY, sizeof(struct card_attr)); // FIXME
514 buck2obj = buck2obj_alloc();
515 imo_decompress_thumbnails_init();
516 clist_init(&pass1_lru_list);
517 clist_init(&pass1_buf_list);
520 pass1_pool = mp_new(1 << 20);
523 for (uns i = 0; i < tree.count; )
525 /* lookup next image */
526 oid_t oid = tree.leaves[i].oid;
527 struct pass1_node *node = pass1_node_lock(oid);
529 /* compare with all near images */
530 struct image_search search;
531 image_search_init(&search, &tree, &precords[i]->vec, pass1_search_dist);
532 /* FIXME: can be faster than general search in KD-tree */
535 for (uns j = 0; j < pass1_search_count && image_search_next(&search, &oid2, &dist); j++)
539 struct pass1_node *node2 = pass1_node_lock(oid2);
540 DBG("comparing %d and %d", oid, oid2);
541 if (image_dup_compare(&node->dup, &node2->dup, IMAGE_DUP_TRANS_ALL))
544 log(L_DEBUG, "*** Found duplicates oid1=0x%x oid=0x%x", (uns)node->oid, (uns)node2->oid);
545 log(L_DEBUG, " %s", node->url);
546 log(L_DEBUG, " %s", node2->url);
549 pass1_node_unlock(node2);
552 image_search_done(&search);
553 pass1_node_unlock(node);
556 log(L_DEBUG, "... passed %d images", i);
560 pass1_hash_cleanup();
562 mp_delete(pass1_pool);
564 bclose(fb_card_attrs);
565 buck2obj_free(buck2obj);
566 imo_decompress_thumbnails_done();
568 /* print statistics */
569 log(L_INFO, "%d count, %Ld lookups, %Ld reads, %Ld pairs, %Ld dups, %Ld shrinks", tree.count,
570 (long long int)pass1_lookups, (long long int)pass1_reads,
571 (long long int)pass1_pairs, (long long int)pass1_dups, (long long int)pass1_shrinks);
574 /*********************************************************************************/
576 static char *shortopts = CF_SHORT_OPTS "";
577 static struct option longopts[] =
583 static char *help = "\
584 Usage: image-indexer [<options>]\n\
586 Options:\n" CF_USAGE;
602 main(int argc UNUSED, char **argv)
607 while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
611 usage("Invalid option");
614 usage("Invalid usage");
618 generate_signatures(20000);