]> mj.ucw.cz Git - libucw.git/blob - images/image-idx.c
058c7774e92c4d0c4b7db44a4fcb8ac9a2db78d0
[libucw.git] / images / image-idx.c
1 #define LOCAL_DEBUG
2
3 #include "sherlock/sherlock.h"
4 #include "lib/mempool.h"
5 #include "lib/conf.h"
6 #include "lib/fastbuf.h"
7 #include "lib/chartype.h"
8 #include "sherlock/object.h"
9 #include "lib/url.h"
10 #include "lib/unicode.h"
11 #include "sherlock/lizard-fb.h"
12 #include "sherlock/tagged-text.h"
13 #include "charset/charconv.h"
14 #include "charset/unicat.h"
15 #include "charset/fb-charconv.h"
16 #include "indexer/indexer.h"
17 #include "indexer/lexicon.h"
18 #include "indexer/params.h"
19 #include "utils/dumpconfig.h"
20 #include "lang/lang.h"
21 #include "lib/base224.h"
22 #include "lib/bbuf.h"
23
24 #include "images/images.h"
25 #include "images/image-thumb.h"
26
27 #include <stdlib.h>
28 #include <fcntl.h>
29 #include <string.h>
30
31 /* This should happen in gatherer or scanner */
32 UNUSED static void
33 generate_signatures(uns limit)
34 {
35   struct fastbuf *fb_cards = index_bopen("cards", O_RDONLY);
36   struct fastbuf *fb_card_attrs = index_bopen("card-attrs", O_RDONLY);
37   struct fastbuf *fb_signatures = index_bopen("image-sig", O_CREAT | O_WRONLY | O_TRUNC);
38   struct card_attr ca;
39   struct image_signature sig;
40   struct mempool *pool = mp_new(1 << 16);
41   struct buck2obj_buf *bob = buck2obj_alloc();
42   uns count = 0;
43
44   log(L_INFO, "Generating image signatures");
45   bputl(fb_signatures, 0);
46   compute_image_signature_prepare();
47
48   for (oid_t oid = 0; bread(fb_card_attrs, &ca, sizeof(ca)); oid++)
49     if ((uns)((ca.type_flags >> 4) - 8) < 4)
50       {
51         bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT);
52         uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
53         uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
54         mp_flush(pool);
55         struct odes *obj = obj_read_bucket(bob, pool, buck_type, buck_len, fb_cards, NULL);
56         struct oattr *attr;
57         if (!obj)
58           die("Failed to read card");
59         if (attr = obj_find_attr(obj, 'N'))
60           {
61             DBG("Reading oid=%d url=%s", oid, obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U'));
62             /*bb_t buf;
63             uns buf_len = 0;
64             bb_init(&buf);
65             for (; attr; attr = attr->same)
66               {
67                 uns len = strlen(attr->val);
68                 bb_grow(&buf, buf_len + len);
69                 memcpy(buf.ptr + buf_len, attr->val, len);
70                 buf_len += len;
71               }
72             byte thumb[buf_len];
73             uns thumb_len = base224_decode(thumb, buf.ptr, buf_len);*/
74             struct image thumb;
75             int err;
76             if (!(err = decompress_thumbnail(obj, pool, &thumb)))
77               {
78                 if (!(err = compute_image_signature(&thumb, &sig)))
79                   {
80                     bwrite(fb_signatures, &oid, sizeof(oid));
81                     bwrite(fb_signatures, &sig.vec, sizeof(struct image_vector));
82                     bputc(fb_signatures, sig.len);
83                     if (sig.len)
84                       bwrite(fb_signatures, sig.reg, sig.len * sizeof(struct image_region));
85                     count++;
86                     if (count >= limit)
87                       break;
88                   }
89                 else
90                   DBG("Cannot create signature, error=%d", err);
91               }
92             else
93               DBG("Cannot decompress thumbnail, error=%d", err);
94           }
95       }
96   brewind(fb_signatures);
97   bputl(fb_signatures, count);
98   DBG("%d signatures written", count);
99
100   compute_image_signature_finish();
101   buck2obj_free(bob);
102   mp_delete(pool);
103   bclose(fb_cards);
104   bclose(fb_card_attrs);
105   bclose(fb_signatures);
106 }
107
108 UNUSED static void
109 generate_random_signatures(uns count)
110 {
111   log(L_INFO, "Generating %d random signatures", count);
112   struct fastbuf *fb_signatures = index_bopen("image-sig", O_CREAT | O_WRONLY | O_TRUNC);
113   bputl(fb_signatures, count);
114   for (uns i = 0; i < count; i++)
115     {
116       oid_t oid = i;
117       struct image_vector vec;
118       for (uns j = 0; j < IMAGE_VEC_K; j++)
119         vec.f[j] = random_max(256); 
120       bwrite(fb_signatures, &oid, sizeof(oid));
121       bwrite(fb_signatures, &vec, sizeof(vec));
122       bputc(fb_signatures, 0);
123     }
124   bclose(fb_signatures); 
125 }
126
127 struct signature_record {
128   oid_t oid;
129   struct image_vector vec;
130 };
131
132 #define ASORT_PREFIX(x) build_search_tree_##x
133 #define ASORT_KEY_TYPE struct signature_record *
134 #define ASORT_ELT(i) rec[i]
135 #define ASORT_LT(x,y) x->vec.f[dim] < y->vec.f[dim]
136 #define ASORT_EXTRA_ARGS , uns dim, struct signature_record **rec
137 #include "lib/arraysort.h"
138
139 #if 0
140 #define DBG_KD(x...) DBG(x)
141 #else
142 #define DBG_KD(x...) do{}while(0)
143 #endif
144
145 static void
146 build_search_tree(void)
147 {
148   log(L_INFO, "Building KD-tree");
149
150   struct fastbuf *fb_signatures = index_bopen("image-sig", O_RDONLY);
151   struct image_tree tree;
152   tree.count = bgetl(fb_signatures);
153   ASSERT(tree.count < 0x80000000);
154   if (!tree.count)
155     {
156       /* FIXME */
157       bclose(fb_signatures);
158       die("There are no signatures");
159     }
160   else
161     {
162       DBG("Reading %d signatures", tree.count);
163       struct signature_record *records = xmalloc(tree.count * sizeof(struct signature_record));
164       struct signature_record **precords = xmalloc(tree.count * sizeof(void *));
165       for (uns i = 0; i < tree.count; i++)
166         {
167           bread(fb_signatures, &records[i].oid, sizeof(oid_t));
168           bread(fb_signatures, &records[i].vec, sizeof(struct image_vector));
169           uns len = bgetc(fb_signatures);
170           bskip(fb_signatures, len * sizeof(struct image_region));
171           precords[i] = records + i;
172           if (likely(i))
173             for (uns j = 0; j < IMAGE_VEC_K; j++)
174               {
175                 tree.bbox.vec[0].f[j] = MIN(tree.bbox.vec[0].f[j], records[i].vec.f[j]);
176                 tree.bbox.vec[1].f[j] = MAX(tree.bbox.vec[1].f[j], records[i].vec.f[j]);
177               }
178           else
179             tree.bbox.vec[0] = tree.bbox.vec[1] = records[0].vec;
180         }
181       bclose(fb_signatures);
182       
183       for (tree.depth = 1; (uns)(2 << tree.depth) < tree.count; tree.depth++);
184       DBG("depth=%d nodes=%d bbox=[(%s), (%s)]", tree.depth, 1 << tree.depth, 
185           stk_print_image_vector(tree.bbox.vec + 0), stk_print_image_vector(tree.bbox.vec + 1));
186       uns leaves_index = 1 << (tree.depth - 1);
187       tree.nodes = xmalloc_zero((1 << tree.depth) * sizeof(struct image_node));
188       tree.leaves = xmalloc_zero(tree.count * sizeof(struct image_leaf));
189      
190       /* Initialize recursion */
191       struct stk {
192         struct image_bbox bbox;
193         uns index, count;
194         struct signature_record **start;
195       } stk_top[32], *stk = stk_top + 1;
196       stk->index = 1;
197       stk->start = precords;
198       stk->count = tree.count;
199       stk->bbox.vec[0] = tree.bbox.vec[0];
200       for (uns i = 0; i < IMAGE_VEC_K; i++)
201         stk->bbox.vec[1].f[i] = tree.bbox.vec[1].f[i] - tree.bbox.vec[0].f[i];
202       uns entry_index = 0;
203     
204       /* Main loop */
205       while (stk != stk_top)
206         {
207           DBG_KD("Main loop... depth=%d index=%d count=%d, start=%d, min=%s dif=%s",
208               stk - stk_top, stk->index, stk->count, stk->start - precords,
209               stk_print_image_vector(stk->bbox.vec + 0), stk_print_image_vector(stk->bbox.vec + 1));
210           ASSERT(stk->count);
211           
212           /* Create leaf node */
213           if (stk->index >= leaves_index || stk->count < 2)
214             {
215               tree.nodes[stk->index].val = IMAGE_NODE_LEAF | entry_index;
216               for (; stk->count--; stk->start++)
217                 {
218                   struct image_leaf *leaf = &tree.leaves[entry_index++];
219                   struct signature_record *record = *stk->start;
220                   leaf->oid = record->oid;
221                   leaf->flags = 0;
222                   for (uns i = IMAGE_VEC_K; i--; )
223                     {
224                       uns bits = IMAGE_LEAF_BITS(i);
225                       leaf->flags <<= bits;
226                       if (stk->bbox.vec[1].f[i])
227                         {
228                           uns value =
229                             (record->vec.f[i] - stk->bbox.vec[0].f[i]) * 
230                             ((1 << bits) - 1) / stk->bbox.vec[1].f[i];
231                           ASSERT(value < (uns)(1 << bits));
232                           leaf->flags |= value;
233                         }
234                     }
235                   if (!stk->count)
236                     leaf->flags |= IMAGE_LEAF_LAST; 
237                   DBG_KD("Creating leaf node; oid=%d vec=(%s) flags=0x%08x", 
238                       leaf->oid, stk_print_image_vector(&record->vec), leaf->flags);
239                 }
240               stk--;
241             }
242          
243           /* Create internal node */
244           else
245             {
246               /* Select dimension to splis */
247               uns dim = 0;
248               for (uns i = 1; i < IMAGE_VEC_K; i++)
249                 if (stk->bbox.vec[1].f[i] > stk->bbox.vec[1].f[dim])
250                   dim = i;
251
252               /* Sort... FIXME: we only need the median */
253               build_search_tree_sort(stk->count, dim, stk->start);
254               
255               /* Split in the middle */
256               uns index = stk->index;
257               stk[1].index = stk[0].index * 2;
258               stk[0].index = stk[1].index + 1;
259               stk[1].count = stk[0].count >> 1;
260               stk[0].count -= stk[1].count;
261               stk[1].start = stk[0].start;
262               stk[0].start += stk[1].count;
263
264               /* Choose split value */
265               uns lval = stk->start[-1]->vec.f[dim];
266               uns rval = stk->start[0]->vec.f[dim];
267               uns pivot = stk->bbox.vec[0].f[dim] + (stk->bbox.vec[1].f[dim] >> 1);
268               if (pivot <= lval)
269                 pivot = lval;
270               else if (pivot >= rval)
271                 pivot = rval;
272
273               DBG_KD("Created internal node; dim=%d pivot=%d", dim, pivot);
274
275               /* Split the box */
276               stk[1].bbox = stk[0].bbox;
277               stk[1].bbox.vec[1].f[dim] = pivot - stk[0].bbox.vec[0].f[dim];
278               stk[0].bbox.vec[0].f[dim] += stk[1].bbox.vec[1].f[dim];
279               stk[0].bbox.vec[1].f[dim] -= stk[1].bbox.vec[1].f[dim];
280
281               /* Fill the node structure */
282               tree.nodes[index].val = dim + (pivot << 8);
283               stk++;
284             }
285         }
286
287       DBG("Tree constructed, saving...");
288
289       struct fastbuf *fb_tree = index_bopen("image-tree", O_CREAT | O_WRONLY | O_TRUNC);
290       bputl(fb_tree, tree.count);
291       bputl(fb_tree, tree.depth);
292       bwrite(fb_tree, &tree.bbox, sizeof(struct image_bbox));
293       bwrite(fb_tree, tree.nodes + 1, ((1 << tree.depth) - 1) * sizeof(struct image_node));
294       bwrite(fb_tree, tree.leaves, tree.count * sizeof(struct image_leaf));
295       bclose(fb_tree);
296
297       xfree(tree.leaves);
298       xfree(tree.nodes);
299       xfree(precords);
300       xfree(records);
301     }
302 }
303
304
305 static char *shortopts = CF_SHORT_OPTS "";
306 static struct option longopts[] =
307 {
308   CF_LONG_OPTS
309   { NULL, 0, 0, 0 }
310 };
311
312 static char *help = "\
313 Usage: image-indexer [<options>]\n\
314 \n\
315 Options:\n" CF_USAGE;
316
317 static void NONRET
318 usage(byte *msg)
319 {
320   if (msg)
321   {
322     fputs(msg, stderr);
323     fputc('\n', stderr);
324   }
325   fputs(help, stderr);
326   exit(1);
327 }
328
329   
330 int
331 main(int argc UNUSED, char **argv)
332 {
333   int opt;
334   
335   log_init(argv[0]);
336   while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
337     switch (opt)
338     {
339       default:
340       usage("Invalid option");
341     }
342   if (optind != argc)
343     usage("Invalid usage");
344
345   generate_signatures(~0U);
346   //generate_random_signatures(1000000);
347   build_search_tree();
348   
349   return 0;
350 }