]> mj.ucw.cz Git - libucw.git/blob - images/image-idx.c
Experimental version of search tree construction...
[libucw.git] / images / image-idx.c
1 #define LOCAL_DEBUG
2
3 #include "sherlock/sherlock.h"
4 #include "lib/mempool.h"
5 #include "lib/conf.h"
6 #include "lib/fastbuf.h"
7 #include "lib/chartype.h"
8 #include "sherlock/object.h"
9 #include "lib/url.h"
10 #include "lib/unicode.h"
11 #include "sherlock/lizard-fb.h"
12 #include "sherlock/tagged-text.h"
13 #include "charset/charconv.h"
14 #include "charset/unicat.h"
15 #include "charset/fb-charconv.h"
16 #include "indexer/indexer.h"
17 #include "indexer/lexicon.h"
18 #include "indexer/params.h"
19 #include "utils/dumpconfig.h"
20 #include "lang/lang.h"
21 #include "lib/base224.h"
22 #include "lib/bbuf.h"
23
24 #include "images/images.h"
25
26 #include <stdlib.h>
27 #include <fcntl.h>
28 #include <string.h>
29
30 /* This should happen in gatherer or scanner */
31 static void
32 generate_signatures(uns limit)
33 {
34   struct fastbuf *fb_cards = index_bopen("cards", O_RDONLY);
35   struct fastbuf *fb_card_attrs = index_bopen("card-attrs", O_RDONLY);
36   struct fastbuf *fb_signatures = index_bopen("image-sig", O_CREAT | O_WRONLY | O_TRUNC);
37   struct card_attr ca;
38   struct image_signature sig;
39   struct mempool *pool = mp_new(1 << 16);
40   struct buck2obj_buf *bob = buck2obj_alloc();
41   uns count = 0;
42
43   log(L_INFO, "Generating image signatures");
44   bputl(fb_signatures, 0);
45   compute_image_signature_prepare();
46
47   for (oid_t oid = 0; bread(fb_card_attrs, &ca, sizeof(ca)); oid++)
48     if ((uns)((ca.type_flags >> 4) - 8) < 4)
49       {
50         bsetpos(fb_cards, (sh_off_t)ca.card << CARD_POS_SHIFT);
51         uns buck_len = bgetl(fb_cards) - (LIZARD_COMPRESS_HEADER - 1);
52         uns buck_type = bgetc(fb_cards) + BUCKET_TYPE_PLAIN;
53         mp_flush(pool);
54         struct odes *obj = obj_read_bucket(bob, pool, buck_type, buck_len, fb_cards, NULL);
55         struct oattr *attr;
56         if (!obj)
57           die("Failed to read card");
58         if (attr = obj_find_attr(obj, 'N'))
59           {
60             DBG("Reading oid=%d url=%s", oid, obj_find_aval(obj_find_attr(obj, 'U' + OBJ_ATTR_SON)->son, 'U'));
61             bb_t buf;
62             uns buf_len = 0;
63             bb_init(&buf);
64             for (; attr; attr = attr->same)
65               {
66                 uns len = strlen(attr->val);
67                 bb_grow(&buf, buf_len + len);
68                 memcpy(buf.ptr + buf_len, attr->val, len);
69                 buf_len += len;
70               }
71             byte thumb[buf_len];
72             uns thumb_len = base224_decode(thumb, buf.ptr, buf_len);
73            
74             int err = compute_image_signature(thumb, thumb_len, &sig);
75             if (!err)
76               {
77                 bwrite(fb_signatures, &oid, sizeof(oid));
78                 bwrite(fb_signatures, &sig.vec, sizeof(struct image_vector));
79                 bputc(fb_signatures, sig.len);
80                 if (sig.len)
81                   bwrite(fb_signatures, sig.reg, sig.len * sizeof(struct image_region));
82                 count++;
83                 if (count >= limit)
84                   break;
85               }
86             else
87               DBG("Cannot create signature, error=%d", err);
88
89             bb_done(&buf);
90           }
91       }
92   brewind(fb_signatures);
93   bputl(fb_signatures, count);
94   DBG("%d signatures written", count);
95
96   compute_image_signature_finish();
97   buck2obj_free(bob);
98   mp_delete(pool);
99   bclose(fb_cards);
100   bclose(fb_card_attrs);
101   bclose(fb_signatures);
102 }
103
104 struct signature_record {
105   oid_t oid;
106   struct image_vector vec;
107 };
108
109 #define ASORT_PREFIX(x) build_search_tree_##x
110 #define ASORT_KEY_TYPE struct signature_record *
111 #define ASORT_ELT(i) rec[i]
112 #define ASORT_LT(x,y) x->vec.f[dim] < y->vec.f[dim]
113 #define ASORT_EXTRA_ARGS , uns dim, struct signature_record **rec
114 #include "lib/arraysort.h"
115
116 #if 0
117 #define DBG_KD(x...) DBG(x)
118 #else
119 #define DBG_KD(x...) do{}while(0)
120 #endif
121
122 static void
123 build_search_tree(void)
124 {
125   log(L_INFO, "Building KD-tree");
126
127   struct fastbuf *fb_signatures = index_bopen("image-sig", O_RDONLY);
128   struct image_tree tree;
129   tree.count = bgetl(fb_signatures);
130   ASSERT(tree.count < 0x80000000);
131   if (!tree.count)
132     {
133       /* FIXME */
134       bclose(fb_signatures);
135       die("There are no signatures");
136     }
137   else
138     {
139       DBG("Reading %d signatures", tree.count);
140       struct signature_record *records = xmalloc(tree.count * sizeof(struct signature_record));
141       struct signature_record **precords = xmalloc(tree.count * sizeof(void *));
142       for (uns i = 0; i < tree.count; i++)
143         {
144           bread(fb_signatures, &records[i].oid, sizeof(oid_t));
145           bread(fb_signatures, &records[i].vec, sizeof(struct image_vector));
146           uns len = bgetc(fb_signatures);
147           bskip(fb_signatures, len * sizeof(struct image_region));
148           precords[i] = records + i;
149           if (likely(i))
150             for (uns j = 0; j < IMAGE_VEC_K; j++)
151               {
152                 tree.bbox.vec[0].f[j] = MIN(tree.bbox.vec[0].f[j], records[i].vec.f[j]);
153                 tree.bbox.vec[1].f[j] = MAX(tree.bbox.vec[1].f[j], records[i].vec.f[j]);
154               }
155           else
156             tree.bbox.vec[0] = tree.bbox.vec[1] = records[0].vec;
157         }
158       bclose(fb_signatures);
159       
160       for (tree.depth = 1; (uns)(2 << tree.depth) < tree.count; tree.depth++);
161       DBG("depth=%d nodes=%d bbox=[(%s), (%s)]", tree.depth, 1 << tree.depth, 
162           stk_print_image_vector(tree.bbox.vec + 0), stk_print_image_vector(tree.bbox.vec + 1));
163       uns leaves_index = 1 << (tree.depth - 1);
164       tree.nodes = xmalloc((1 << tree.depth) * sizeof(struct image_node));
165       tree.leaves = xmalloc(tree.count * sizeof(struct image_leaf));
166      
167       /* Initialize recursion */
168       struct stk {
169         struct image_bbox bbox;
170         uns index, count;
171         struct signature_record **start;
172       } stk_top[32], *stk = stk_top + 1;
173       stk->index = 1;
174       stk->start = precords;
175       stk->count = tree.count;
176       stk->bbox.vec[0] = tree.bbox.vec[0];
177       for (uns i = 0; i < IMAGE_VEC_K; i++)
178         stk->bbox.vec[1].f[i] = tree.bbox.vec[1].f[i] - tree.bbox.vec[0].f[i];
179       uns entry_index = 0;
180     
181       /* Main loop */
182       while (stk != stk_top)
183         {
184           DBG_KD("Main loop... depth=%d index=%d count=%d, start=%d, min=%s dif=%s",
185               stk - stk_top, stk->index, stk->count, stk->start - precords,
186               stk_print_image_vector(stk->bbox.vec + 0), stk_print_image_vector(stk->bbox.vec + 1));
187           ASSERT(stk->count);
188           
189           /* Create leaf node */
190           if (stk->index >= leaves_index || stk->count < 2)
191             {
192               tree.nodes[stk->index].val = IMAGE_NODE_LEAF | entry_index;
193               for (; stk->count--; stk->start++)
194                 {
195                   struct image_leaf *leaf = &tree.leaves[entry_index++];
196                   struct signature_record *record = *stk->start;
197                   leaf->oid = record->oid;
198                   leaf->flags = 0;
199                   for (uns i = IMAGE_VEC_K; i--; )
200                     {
201                       uns bits = IMAGE_LEAF_BITS(i);
202                       leaf->flags <<= bits;
203                       if (stk->bbox.vec[1].f[i])
204                         {
205                           uns value =
206                             (record->vec.f[i] - stk->bbox.vec[0].f[i]) * 
207                             ((1 << bits) - 1) / stk->bbox.vec[1].f[i];
208                           ASSERT(value < (1 << bits));
209                           leaf->flags |= value;
210                         }
211                     }
212                   if (!stk->count)
213                     leaf->flags |= IMAGE_LEAF_LAST; 
214                   DBG_KD("Creating leaf node; oid=%d vec=(%s) flags=0x%08x", 
215                       leaf->oid, stk_print_image_vector(&record->vec), leaf->flags);
216                 }
217               stk--;
218             }
219          
220           /* Create internal node */
221           else
222             {
223               /* Select dimension to splis */
224               uns dim = 0;
225               for (uns i = 1; i < IMAGE_VEC_K; i++)
226                 if (stk->bbox.vec[1].f[i] > stk->bbox.vec[1].f[dim])
227                   dim = i;
228
229               /* Sort... FIXME: we only need the median */
230               build_search_tree_sort(stk->count, dim, stk->start);
231               
232               /* Split in the middle */
233               uns index = stk->index;
234               stk[1].index = stk[0].index * 2;
235               stk[0].index = stk[1].index + 1;
236               stk[1].count = stk[0].count >> 1;
237               stk[0].count -= stk[1].count;
238               stk[1].start = stk[0].start;
239               stk[0].start += stk[1].count;
240
241               /* Choose split value */
242               uns lval = stk->start[-1]->vec.f[dim];
243               uns rval = stk->start[0]->vec.f[dim];
244               uns pivot = stk->bbox.vec[0].f[dim] + (stk->bbox.vec[1].f[dim] >> 1);
245               if (pivot <= lval)
246                 pivot = lval;
247               else if (pivot >= rval)
248                 pivot = rval;
249
250               DBG_KD("Created internal node; dim=%d pivot=%d", dim, pivot);
251
252               /* Split the box */
253               stk[1].bbox = stk[0].bbox;
254               stk[1].bbox.vec[1].f[dim] = pivot - stk[0].bbox.vec[0].f[dim];
255               stk[0].bbox.vec[0].f[dim] += stk[1].bbox.vec[1].f[dim];
256               stk[0].bbox.vec[1].f[dim] -= stk[1].bbox.vec[1].f[dim];
257
258               /* Fill the node structure */
259               tree.nodes[index].val = dim + (pivot << 8);
260               stk++;
261             }
262         }
263
264       DBG("Tree constructed, saving...");
265
266       struct fastbuf *fb_tree = index_bopen("image-tree", O_CREAT | O_WRONLY | O_TRUNC);
267       bputl(fb_tree, tree.count);
268       bputl(fb_tree, tree.depth);
269       bwrite(fb_tree, tree.nodes + 1, ((1 << tree.depth) - 1) * sizeof(struct image_node));
270       bwrite(fb_tree, tree.leaves, tree.count * sizeof(struct image_leaf));
271       bclose(fb_tree);
272
273       xfree(tree.leaves);
274       xfree(tree.nodes);
275       xfree(precords);
276       xfree(records);
277     }
278 }
279
280
281 static char *shortopts = CF_SHORT_OPTS "";
282 static struct option longopts[] =
283 {
284   CF_LONG_OPTS
285   { NULL, 0, 0, 0 }
286 };
287
288 static char *help = "\
289 Usage: image-indexer [<options>]\n\
290 \n\
291 Options:\n" CF_USAGE;
292
293 static void NONRET
294 usage(byte *msg)
295 {
296   if (msg)
297   {
298     fputs(msg, stderr);
299     fputc('\n', stderr);
300   }
301   fputs(help, stderr);
302   exit(1);
303 }
304
305   
306 int
307 main(int argc UNUSED, char **argv)
308 {
309   int opt;
310   
311   log_init(argv[0]);
312   while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
313     switch (opt)
314     {
315       default:
316       usage("Invalid option");
317     }
318   if (optind != argc)
319     usage("Invalid usage");
320
321   generate_signatures(~0U);
322   build_search_tree();
323   
324   return 0;
325 }