Minor cleanup of KMP:

author Martin Mares <mj@ucw.cz>

Tue, 20 Sep 2005 20:11:08 +0000 (20:11 +0000)

committer Martin Mares <mj@ucw.cz>

Tue, 20 Sep 2005 20:11:08 +0000 (20:11 +0000)
author Martin Mares <mj@ucw.cz>
Tue, 20 Sep 2005 20:11:08 +0000 (20:11 +0000)
committer Martin Mares <mj@ucw.cz>
Tue, 20 Sep 2005 20:11:08 +0000 (20:11 +0000)
diff --git a/lib/kmp.c b/lib/kmp.c

index 15aff3d1e98e80e8f776c7fa44518d2baad90317..70f1be27dd4f9d97566cb29eb5227f5b9ede11ce 100644 (file)
--- a/lib/kmp.c
+++ b/lib/kmp.c
@@ -1,7 +1,6 @@
  /*
- *     Knuth-Morris-Pratt's search automat for N given strings
- *             (taken from ad2 super-server and pruned :-)
- * 
+ *     Knuth-Morris-Pratt's Substring Search for N given strings
+ *
   *     (c) 1999--2005, Robert Spalek <robert@ucw.cz>
   */
  
@@ -11,7 +10,7 @@
  #include "sherlock/tagged-text.h"
  #include "lib/unicode.h"
  
-#define KMP_TRANSLATE(new_c, c, flags) NO_TRANSLATE(new_c, c, flags)
+#define KMP_GET_CHAR KMP_GET_RAW
  #include "lib/kmp.h"
  
  #include <stdlib.h>
diff --git a/lib/kmp.h b/lib/kmp.h

index 0f49cc74d28a5abd2955f14239965f83a6a382b8..abbf3b2312397bebcbd9aa1ef3ed81f0b0f18c8a 100644 (file)
--- a/lib/kmp.h
+++ b/lib/kmp.h
@@ -1,11 +1,17 @@
  /*
- *     Knuth-Morris-Pratt's search automat for N given strings
+ *     Knuth-Morris-Pratt's Substring Search for N given strings
   * 
   *     (c) 1999--2005, Robert Spalek <robert@ucw.cz>
   *
- *     Set KMP_TRANSLATE to one of {UNICODE,ASCII,NO}_TRANSLATE.
+ *     This is a preprocessor template: you need to set KMP_GET_CHAR
+ *     to a macro for reading a single character from the input and
+ *     translating it according to the MF_xxx flags. See below for
+ *     some pre-defined values.
   *
- *     Don't touch this file, because it is a mess.
+ *     Don't touch this file, it can bite.
+ *
+ *     (In fact, the algorithm is usually referred to as Aho-McCorasick,
+ *     but that's just an extension of KMP to multiple strings.)
   */
  
  #ifndef _UCW_KMP_H
@@ -13,38 +19,31 @@
  
  #include "lib/lists.h"
  
+/*
+ * Input conversion flags (the conversion is handled exclusively by the KMP_GET_CHAR
+ * macro, so you can define your own conversion modes, soo).
+ */
  #define        MF_TOLOWER      1
  #define        MF_UNACCENT     2
-#define        MF_ONLYALPHA    4
-       /* how to modify characters in the string */
-#define        CONTROL_CHAR    ':'
-       /* all non-alphabetic characters are treated as CONTROL_CHAR */
-
-#define UNICODE_TRANSLATE(new_c, c, flags)     do { \
-       if (!c) \
-               new_c = 0; \
-       else { \
-               if (flags & MF_TOLOWER) \
-                       c = Utolower(c); \
-               if (flags & MF_UNACCENT) \
-                       c = Uunaccent(c); \
-               if (flags & MF_ONLYALPHA && !Ualpha(c)) \
-                       c = CONTROL_CHAR; \
-               new_c = c; \
-       } \
-} while (0)
-#define ASCII_TRANSLATE(new_c, c, flags)       do { \
-       if (!c) \
-               new_c = 0; \
-       else { \
+#define        MF_ONLYALPHA    4       /* Convert non-alphas to KMP_CONTROL_CHAR */
+
+#define        KMP_CONTROL_CHAR ':'
+
+/* Pre-defined input functions */
+
+#define KMP_GET_RAW(pos, c, flags) c=*pos++
+
+#define KMP_GET_ASCII(pos, c, flags) do { \
+       c = *pos++; \
+       if (c) { \
                 if (flags & MF_TOLOWER) \
                         c = Clocase(c); \
                 if (flags & MF_ONLYALPHA && !Calpha(c)) \
-                       c = CONTROL_CHAR; \
-               new_c = c; \
+                       c = KMP_CONTROL_CHAR; \
         } \
  } while (0)
-#define NO_TRANSLATE(new_c, c, flags)  do { new_c = c; } while (0)
+
+/* Types and structures */
  
  typedef uns kmp_state_t;
  typedef word kmp_char_t;
@@ -78,6 +77,11 @@ struct kmp {
         struct kmp_output **out;        /* found words for every state */
  };
  
+struct kmp_result {
+       struct node n;                  /* strings with non-zero frequency are put into a link-list */
+       uns occur;
+};
+
  /* kmp.c */
  struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags);
  void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id);
@@ -88,13 +92,9 @@ kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED)
  {
         while (1)
         {
-               uns w;
                 kmp_char_t new_c;
-               GET_TAGGED_CHAR((*str), w);
-               if (w >= 0x80000000)
-                       w = CONTROL_CHAR;
-               KMP_TRANSLATE(new_c, w, modify_flags);
-               if (new_c != CONTROL_CHAR || *c != CONTROL_CHAR)
+               KMP_GET_CHAR((*str), new_c, modify_flags);
+               if (new_c != KMP_CONTROL_CHAR || *c != KMP_CONTROL_CHAR)
                 {
                         *c = new_c;
                         return;
@@ -105,10 +105,11 @@ kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED)
  static inline void
  kmp_enter_string(struct kmp *kmp, const byte *str, uns id)
  {
-       /* Ugly hack to avoid linking libucw with libcharset:
-        * Convert the entered string here in the header and then call the
-        * library function.  This function calls kmp_get_char() without any
-        * conversion.  */
+       /* To avoid dependencies between libucw and other libraries (which might
+        * be referenced by the KMP_GET_CHAR macro), we have to split kmp_enter_string()
+        * to a conversion wrapper (this function) and the rest, which resides in kmp.c
+        * and uses KMP_GET_RAW to read its input.
+        */
         byte buf[3*strlen(str)+1], *str2 = buf;
         kmp_char_t c;
         do
@@ -120,11 +121,6 @@ kmp_enter_string(struct kmp *kmp, const byte *str, uns id)
         kmp_enter_raw_string(kmp, buf, id);
  }
  
-struct kmp_result {
-       struct node n;                  /* strings with non-zero frequency are put into a link-list */
-       uns occur;
-};
-
  static inline uns
  transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr)
  {
@@ -166,7 +162,7 @@ kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes,
  {
         byte *str_end = str + len;
         kmp_state_t s = 0;
-       kmp_char_t c = CONTROL_CHAR;
+       kmp_char_t c = KMP_CONTROL_CHAR;
         struct kmp_transition tr, **prev;
         byte eof = 0;
         if (kmp->words_len <= 1)
@@ -204,8 +200,8 @@ kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes,
                         kmp_get_char((const byte **)&str, &c, kmp->modify_flags);
                 if (!c)
                 {
-                       /* Insert CONTROL_CHAR at the beginning and at the end too.  */
-                       c = CONTROL_CHAR;
+                       /* Insert KMP_CONTROL_CHAR at the beginning and at the end too.  */
+                       c = KMP_CONTROL_CHAR;
                         eof = 1;
                 }
         }
author	Martin Mares <mj@ucw.cz>
	Tue, 20 Sep 2005 20:11:08 +0000 (20:11 +0000)
committer	Martin Mares <mj@ucw.cz>
	Tue, 20 Sep 2005 20:11:08 +0000 (20:11 +0000)
lib/kmp.c		patch \| blob \| history
lib/kmp.h		patch \| blob \| history