From 9dcccfcba5dcb33d3271ac8681f0c65a0a6d44f5 Mon Sep 17 00:00:00 2001
From: Pavel Charvat
Date: Thu, 15 Mar 2007 21:56:46 +0100
Subject: [PATCH] libucw: added UTF-8 correctness checker

---
 lib/unicode-utf8.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 lib/unicode.h      |  1 +
 2 files changed, 46 insertions(+)

diff --git a/lib/unicode-utf8.c b/lib/unicode-utf8.c
index 5f7b8186..8c8ac1bd 100644
--- a/lib/unicode-utf8.c
+++ b/lib/unicode-utf8.c
@@ -36,6 +36,51 @@ utf8_strnlen(byte *str, uns n)
   return len;
 }
 
+/*
+ * Check whether the zero-terminated string @s is valid UTF-8.
+ *
+ * Accepted are ASCII bytes and two- and three-byte sequences; any lead
+ * byte of 0xf0 or above is rejected, so the longest accepted sequence is
+ * three bytes. Overlong encodings (lead bytes 0xc0 and 0xc1, or 0xe0
+ * followed by 0x80-0x9f) are rejected as well, because they would give
+ * a single character more than one byte representation.
+ *
+ * Returns 1 if the whole string is valid, 0 at the first bad sequence.
+ */
+uns
+utf8_check(byte *s)
+{
+#define UTF8_CHECK_NEXT if (unlikely((*s & 0xc0) != 0x80)) goto bad; s++
+  while (*s)
+    {
+      uns u = *s++;
+      if (u < 0x80)
+        ;
+      else if (unlikely(u < 0xc2))
+        {
+          /* Stray continuation byte or overlong two-byte lead (0xc0, 0xc1) */
+bad:
+          return 0;
+        }
+      else if (u < 0xe0)
+        {
+          UTF8_CHECK_NEXT;
+        }
+      else if (likely(u < 0xf0))
+        {
+          /* 0xe0 0x80..0x9f would encode a value below 0x800 (overlong) */
+          if (unlikely(u == 0xe0 && (*s & 0xe0) == 0x80))
+            goto bad;
+          UTF8_CHECK_NEXT;
+          UTF8_CHECK_NEXT;
+        }
+      else
+        goto bad;
+    }
+  return 1;
+#undef UTF8_CHECK_NEXT
+}
+
 #ifdef TEST
 #include
 #include
diff --git a/lib/unicode.h b/lib/unicode.h
index 79934540..5e8dac8f 100644
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -204,5 +204,6 @@ utf8_encoding_len(uns c)
 
 uns utf8_strlen(byte *str);
 uns utf8_strnlen(byte *str, uns n);
+uns utf8_check(byte *str);
 
 #endif
-- 
2.39.2