X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Fkmp.h;h=fd37a6e30b30471cfa3202f58b0ea01dcf36d43d;hb=86919305a08aa88b3a60c3216752291ba7f0b496;hp=0f49cc74d28a5abd2955f14239965f83a6a382b8;hpb=4559b9d5b6ae1eabe667437f9dd2a141d5c2c1bb;p=libucw.git diff --git a/lib/kmp.h b/lib/kmp.h index 0f49cc74..fd37a6e3 100644 --- a/lib/kmp.h +++ b/lib/kmp.h @@ -1,11 +1,17 @@ /* - * Knuth-Morris-Pratt's search automat for N given strings + * Knuth-Morris-Pratt's Substring Search for N given strings * * (c) 1999--2005, Robert Spalek * - * Set KMP_TRANSLATE to one of {UNICODE,ASCII,NO}_TRANSLATE. + * This is a preprocessor template: you need to set KMP_GET_CHAR + * to a macro for reading a single character from the input and + * translating it according to the MF_xxx flags. See below for + * some pre-defined values. * - * Don't touch this file, because it is a mess. + * Don't touch this file, it can bite. + * + * (In fact, the algorithm is usually referred to as Aho-McCorasick, + * but that's just an extension of KMP to multiple strings.) */ #ifndef _UCW_KMP_H @@ -13,38 +19,33 @@ #include "lib/lists.h" +#include + +/* + * Input conversion flags (the conversion is handled exclusively by the KMP_GET_CHAR + * macro, so you can define your own conversion modes, soo). + */ #define MF_TOLOWER 1 #define MF_UNACCENT 2 -#define MF_ONLYALPHA 4 - /* how to modify characters in the string */ -#define CONTROL_CHAR ':' - /* all non-alphabetic characters are treated as CONTROL_CHAR */ - -#define UNICODE_TRANSLATE(new_c, c, flags) do { \ - if (!c) \ - new_c = 0; \ - else { \ - if (flags & MF_TOLOWER) \ - c = Utolower(c); \ - if (flags & MF_UNACCENT) \ - c = Uunaccent(c); \ - if (flags & MF_ONLYALPHA && !Ualpha(c)) \ - c = CONTROL_CHAR; \ - new_c = c; \ - } \ -} while (0) -#define ASCII_TRANSLATE(new_c, c, flags) do { \ - if (!c) \ - new_c = 0; \ - else { \ +#define MF_ONLYALPHA 4 /* Convert non-alphas to KMP_CONTROL_CHAR */ + +#define KMP_CONTROL_CHAR ':' + +/* Pre-defined input functions */ + +#define KMP_GET_UTF8(pos, c, flags) do { uns cc; pos = utf8_get(pos, &cc); c = cc; } while(0) + +#define KMP_GET_ASCII(pos, c, flags) do { \ + c = *pos++; \ + if (c) { \ if (flags & MF_TOLOWER) \ c = Clocase(c); \ if (flags & MF_ONLYALPHA && !Calpha(c)) \ - c = CONTROL_CHAR; \ - new_c = c; \ + c = KMP_CONTROL_CHAR; \ } \ } while (0) -#define NO_TRANSLATE(new_c, c, flags) do { new_c = c; } while (0) + +/* Types and structures */ typedef uns kmp_state_t; typedef word kmp_char_t; @@ -78,23 +79,24 @@ struct kmp { struct kmp_output **out; /* found words for every state */ }; +struct kmp_result { + struct node n; /* strings with non-zero frequency are put into a link-list */ + uns occur; +}; + /* kmp.c */ struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags); -void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id); +void kmp_enter_raw_string(struct kmp *kmp, kmp_char_t *str, uns id); void kmp_build(struct kmp *kmp); static inline void -kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED) +kmp_get_char(const byte **str UNUSED, kmp_char_t *c, uns modify_flags UNUSED) { while (1) { - uns w; kmp_char_t new_c; - GET_TAGGED_CHAR((*str), w); - if (w >= 0x80000000) - w = CONTROL_CHAR; - KMP_TRANSLATE(new_c, w, modify_flags); - if (new_c != CONTROL_CHAR || *c != CONTROL_CHAR) + KMP_GET_CHAR((*str), new_c, modify_flags); + if (new_c != KMP_CONTROL_CHAR || *c != KMP_CONTROL_CHAR) { *c = new_c; return; @@ -105,26 +107,21 @@ kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED) static inline void kmp_enter_string(struct kmp *kmp, const byte *str, uns id) { - /* Ugly hack to avoid linking libucw with libcharset: - * Convert the entered string here in the header and then call the - * library function. This function calls kmp_get_char() without any - * conversion. */ - byte buf[3*strlen(str)+1], *str2 = buf; - kmp_char_t c; + /* To avoid dependencies between libucw and other libraries (which might + * be referenced by the KMP_GET_CHAR macro), we have to split kmp_enter_string() + * to a conversion wrapper (this function) and the rest, which resides in kmp.c + * and uses zero-terminated array of kmp_char_t characters as its input. + */ + kmp_char_t buf[strlen(str)+1], *str2 = buf, c = 0; do { kmp_get_char(&str, &c, kmp->modify_flags); - str2 = utf8_put(str2, c); + *str2++ = c; } while (c); kmp_enter_raw_string(kmp, buf, id); } -struct kmp_result { - struct node n; /* strings with non-zero frequency are put into a link-list */ - uns occur; -}; - static inline uns transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr) { @@ -164,9 +161,10 @@ kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, /* For every found string with id ID, it increments freq[ID]. * Also, it finds the longest among the leftmost matches. */ { - byte *str_end = str + len; + if (!len) + return NULL; kmp_state_t s = 0; - kmp_char_t c = CONTROL_CHAR; + kmp_char_t c = KMP_CONTROL_CHAR; struct kmp_transition tr, **prev; byte eof = 0; if (kmp->words_len <= 1) @@ -198,14 +196,14 @@ kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, } if (eof) break; - if (str >= str_end) + if (!--len) c = 0; else kmp_get_char((const byte **)&str, &c, kmp->modify_flags); if (!c) { - /* Insert CONTROL_CHAR at the beginning and at the end too. */ - c = CONTROL_CHAR; + /* Insert KMP_CONTROL_CHAR at the beginning and at the end too. */ + c = KMP_CONTROL_CHAR; eof = 1; } } @@ -213,9 +211,9 @@ kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, } static inline void -kmp_search(struct kmp *kmp, const byte *str, struct list *nonzeroes, struct kmp_result *freq) +kmp_search(struct kmp *kmp, const byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq) { - kmp_search_internal(kmp, (byte*) str, strlen(str), nonzeroes, freq, NULL); + kmp_search_internal(kmp, (byte*) str, len, nonzeroes, freq, NULL); } static inline byte *