]> mj.ucw.cz Git - misc.git/blob - ukkonen.c
Gramatikator
[misc.git] / ukkonen.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 #define INFTY 1000000
6
7 struct node {
8         struct node *first_son;
9         struct node *next_sibling;
10         struct node *backlink;          /* We use them only for internal nodes */
11         char *label;                    /* Label of edge above this node */
12         int lablen;
13 };
14
15 static void dump_tree(struct node *n, int level, char *strend)
16 {
17         for (int i=0; i<level; i++)
18                 putchar('.');
19         printf("<%.*s%s> [%p] {%p}\n",
20                 (n->lablen >= INFTY/2 ? strend-n->label : n->lablen), n->label,
21                 (n->lablen >= INFTY/2 ? "*" : ""), n, n->backlink);
22         for (n=n->first_son; n; n=n->next_sibling)
23                 dump_tree(n, level+1, strend);
24 }
25
26 static struct node *new_node(void)
27 {
28         struct node *n = malloc(sizeof(*n));
29         bzero(n, sizeof(*n));
30         return n;
31 }
32
33 static struct node *lookup_edge(struct node *x, int c)
34 {
35         x = x->first_son;
36         while (x && x->label[0] != c)
37                 x = x->next_sibling;
38         return x;
39 }
40
41 static struct node *build_tree(char *string)
42 {
43         struct node *root = new_node();
44         struct node *l = root;          /* Currently longest nested suffix as a canonical reference pair (node, label) */
45         char *llabel = string;
46         int llablen = 0;
47         struct node *x, *y, *z, *trash, **blp;
48
49         for (int i=0; string[i]; i++) {
50                 blp = &trash;
51                 for(;;) {
52                         while (llablen) {                               /* Canonicalize the reference pair */
53                                 x = lookup_edge(l, llabel[0]);
54                                 if (x->lablen > llablen)
55                                         break;
56                                 l = x;
57                                 llabel += x->lablen;
58                                 llablen -= x->lablen;
59                         }
60                         if (!llablen) {                                 /* LNS is an internal node or root */
61                                 *blp = l;
62                                 blp = &trash;
63                                 if (lookup_edge(l, string[i])) {                /* Found LNS for the new string */
64                                         llabel = string+i;
65                                         llablen = 1;
66                                         break;
67                                 }
68                                 z = new_node();
69                                 z->next_sibling = l->first_son;
70                                 l->first_son = z;
71                         } else {                                        /* LNS is in the middle of an edge */
72                                 x = lookup_edge(l, llabel[0]);
73                                 if (x->label[llablen] == string[i]) {           /* found LNS for the new string */
74                                         llablen++;
75                                         break;
76                                 }
77                                 y = new_node();                         /* New interior node which gets all sons of x */
78                                 z = new_node();                         /* New leaf connected below x */
79                                 y->first_son = x->first_son;
80                                 y->next_sibling = NULL;
81                                 y->backlink = x->backlink;
82                                 y->label = x->label + llablen;
83                                 y->lablen = x->lablen - llablen;
84                                 x->first_son = z;
85                                 x->backlink = NULL;
86                                 *blp = x;
87                                 blp = &x->backlink;                     /* Backlink will be filled in later */
88                                 x->lablen = llablen;
89                                 z->next_sibling = y;
90                         }
91                         z->label = string+i;
92                         z->lablen = INFTY;
93
94                         if (l == root) {                                /* Cut the first character of the LNS */
95                                 if (!llablen)
96                                         break;
97                                 llabel++;
98                                 llablen--;
99                         } else
100                                 l = l->backlink;
101                 }
102                 *blp = root;
103         }
104         return root;
105 }
106
107 int main(int argc, char **argv)
108 {
109         char *string = argv[1];
110         struct node *root = build_tree(string);
111         puts("Result:");
112         dump_tree(root, 0, string+strlen(string));
113         return 0;
114 }