From: Martin Mares
Date: Sun, 14 Mar 2004 12:58:40 +0000 (+0000)
Subject: Our regex functions are now able to interface to old-style BSD re_match(),
X-Git-Tag: holmes-import~1097
X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=97a9eb186ea7f6726258e164d75503ebb18fc490;p=libucw.git

Our regex functions are now able to interface to old-style BSD re_match(),
to POSIX regexec() and to libpcre.

Currently it's switched to the BSD mode as before, I'll look at it more
in the evening.
---

diff --git a/lib/regex.c b/lib/regex.c
index b9037ae3..b74fb6a2 100644
--- a/lib/regex.c
+++ b/lib/regex.c
@@ -1,7 +1,7 @@
 /*
- *	Sherlock Library -- Regular Expressions
+ *	Sherlock Library -- Interface to Regular Expression Libraries
  *
- *	(c) 1997 Martin Mares
+ *	(c) 1997--2004 Martin Mares
  *	(c) 2001 Robert Spalek
  *
  *	This software may be freely distributed and used according to the terms
@@ -10,10 +10,16 @@
 
 #include "lib/lib.h"
 #include "lib/chartype.h"
+#include "lib/hashfunc.h"
 
 #include
 #include
 #include
+
+#if 1
+
+/* BSD regular expression library */
+
 #include
 
 #define INITIAL_MEM 1024 /* Initial space allocated for each pattern */
@@ -111,6 +117,222 @@ rx_subst(regex *r, byte *by, byte *src, byte *dest, uns destlen)
   return 1;
 }
 
+#elif 0
+
+/* POSIX regular expression library */
+
+#include <regex.h>
+
+struct regex {
+  regex_t rx;
+  regmatch_t matches[10];               /* Whole match + groups \1..\9 used by rx_subst() */
+};
+
+/* Compile extended regex p (case-insensitive if icase); die()s on a syntax error. */
+regex *
+rx_compile(byte *p, int icase)
+{
+  regex *r = xmalloc_zero(sizeof(regex));
+
+  int err = regcomp(&r->rx, p, REG_EXTENDED | (icase ? REG_ICASE : 0));
+  if (err)
+    {
+      byte msg[256];
+      regerror(err, &r->rx, msg, sizeof(msg)-1);
+      /* regfree(&r->rx) not needed */
+      die("Error parsing regular expression `%s': %s", p, msg);
+    }
+  return r;
+}
+
+/* Release a regex returned by rx_compile(). */
+void
+rx_free(regex *r)
+{
+  regfree(&r->rx);
+  xfree(r);
+}
+
+/* Return 1 if the whole string s matches r, 0 otherwise; die()s on matcher errors. */
+int
+rx_match(regex *r, byte *s)
+{
+  int err = regexec(&r->rx, s, 10, r->matches, 0);
+  if (!err)
+    {
+      /* regexec doesn't support anchored expressions, so we have to check ourselves that the full string is matched */
+      return !(r->matches[0].rm_so || s[r->matches[0].rm_eo]);
+    }
+  else if (err == REG_NOMATCH)
+    return 0;
+  else if (err == REG_ESPACE)
+    die("Regex matching ran out of memory");
+  else
+    die("Regex matching failed with unknown error %d", err);
+}
+
+/*
+ *  Match src against r and expand the template `by' to dest (at most destlen
+ *  bytes including the terminating zero). \0..\9 are replaced by the
+ *  corresponding matched group, any other \x stands for x itself.
+ *  Returns 1 on success, 0 if src doesn't match and -1 if dest is too small.
+ */
+int
+rx_subst(regex *r, byte *by, byte *src, byte *dest, uns destlen)
+{
+  byte *end = dest + destlen - 1;
+
+  if (!rx_match(r, src))
+    return 0;
+
+  while (*by)
+    {
+      if (*by == '\\')
+        {
+          by++;
+          if (*by >= '0' && *by <= '9') /* \0 gets replaced by entire pattern */
+            {
+              uns j = *by++ - '0';
+              if (j <= r->rx.re_nsub && r->matches[j].rm_so >= 0)
+                {
+                  byte *s = src + r->matches[j].rm_so;
+                  uns i = r->matches[j].rm_eo - r->matches[j].rm_so;
+                  if (dest + i >= end)
+                    return -1;
+                  memcpy(dest, s, i);
+                  dest += i;
+                  continue;
+                }
+            }
+          /* Dangling backslash or unset group at the very end: don't step over the terminator */
+          if (!*by)
+            break;
+        }
+      if (dest < end)
+        *dest++ = *by++;
+      else
+        return -1;
+    }
+  *dest = 0;
+  return 1;
+}
+
+#else
+
+/* PCRE library */
+
+#include <pcre.h>
+
+struct regex {
+  pcre *rx;
+  pcre_extra *extra;
+  uns match_array_size;
+  uns real_matches;
+  int matches[0];                       /* (max_matches+1) pairs (start,end) plus some workspace */
+};
+
+/* Compile anchored regex p (case-insensitive if icase) and study it; die()s on errors. */
+regex *
+rx_compile(byte *p, int icase)
+{
+  const char *err;
+  int errpos, match_array_size, eno;
+
+  pcre *rx = pcre_compile(p, PCRE_ANCHORED | PCRE_EXTRA | (icase ? PCRE_CASELESS : 0), &err, &errpos, NULL);
+  if (!rx)
+    die("Error parsing regular expression `%s': %s at position %d", p, err, errpos);
+  eno = pcre_fullinfo(rx, NULL, PCRE_INFO_CAPTURECOUNT, &match_array_size);
+  if (eno)
+    die("Internal error: pcre_fullinfo() failed with error %d", eno);
+  match_array_size = 3*(match_array_size+1);    /* pcre_exec() wants 2/3 for offsets + 1/3 workspace */
+  regex *r = xmalloc_zero(sizeof(regex) + match_array_size * sizeof(int));
+  r->rx = rx;
+  r->match_array_size = match_array_size;
+  r->extra = pcre_study(r->rx, 0, &err);
+  if (err)
+    die("Error studying regular expression `%s': %s", p, err);
+  return r;
+}
+
+/* Release a regex returned by rx_compile(). */
+void
+rx_free(regex *r)
+{
+  /* PCRE allocates these by pcre_malloc(), so they must be returned by pcre_free() */
+  pcre_free(r->rx);
+  if (r->extra)
+    pcre_free(r->extra);
+  xfree(r);
+}
+
+/* Return 1 if the whole string s matches r, 0 otherwise; die()s on matcher errors. */
+int
+rx_match(regex *r, byte *s)
+{
+  int len = str_len(s);
+  int err = pcre_exec(r->rx, r->extra, s, len, 0, 0, r->matches, r->match_array_size);
+  if (err >= 0)
+    {
+      r->real_matches = err;
+      /* need to check that the full string matches */
+      return !(r->matches[0] || s[r->matches[1]]);
+    }
+  else if (err == PCRE_ERROR_NOMATCH)
+    return 0;
+  else if (err == PCRE_ERROR_NOMEMORY)
+    die("Regex matching ran out of memory");
+  else
+    die("Regex matching failed with unknown error %d", err);
+}
+
+/*
+ *  Match src against r and expand the template `by' to dest (at most destlen
+ *  bytes including the terminating zero). \0..\9 are replaced by the
+ *  corresponding matched group, any other \x stands for x itself.
+ *  Returns 1 on success, 0 if src doesn't match and -1 if dest is too small.
+ */
+int
+rx_subst(regex *r, byte *by, byte *src, byte *dest, uns destlen)
+{
+  byte *end = dest + destlen - 1;
+
+  if (!rx_match(r, src))
+    return 0;
+
+  while (*by)
+    {
+      if (*by == '\\')
+        {
+          by++;
+          if (*by >= '0' && *by <= '9') /* \0 gets replaced by entire pattern */
+            {
+              uns j = *by++ - '0';
+              if (j < r->real_matches && r->matches[2*j] >= 0)
+                {
+                  byte *s = src + r->matches[2*j];
+                  uns i = r->matches[2*j+1] - r->matches[2*j];
+                  if (dest + i >= end)
+                    return -1;
+                  memcpy(dest, s, i);
+                  dest += i;
+                  continue;
+                }
+            }
+          /* Dangling backslash or unset group at the very end: don't step over the terminator */
+          if (!*by)
+            break;
+        }
+      if (dest < end)
+        *dest++ = *by++;
+      else
+        return -1;
+    }
+  *dest = 0;
+  return 1;
+}
+
+#endif
+
 #ifdef TEST
 
 int main(int argc, char **argv)