From 94abc5743c1faf0a254408280c5aa61ed0f034c7 Mon Sep 17 00:00:00 2001
From: Pavel Charvat
Date: Fri, 29 Dec 2017 14:09:06 +0100
Subject: [PATCH] UTF-8: Added unicode_full_sanitize_char()

---
 ucw/unicode.h | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/ucw/unicode.h b/ucw/unicode.h
index e6ad42ab..d84f5038 100644
--- a/ucw/unicode.h
+++ b/ucw/unicode.h
@@ -281,6 +281,31 @@ static inline void *utf16_be_get(const void *p, uint *uu)
   return utf16_be_get_repl(p, uu, UNI_REPLACEMENT);
 }
 
+// Internal helper: bit N of allowed_ctl permits control char N (< 0x20); values >= limit are rejected
+static inline uint unicode_sanitize_char_internal(uint u, uint limit, uint allowed_ctl)
+{
+  if (u - 0x20 >= 0x60) // Unsigned wrap-around: true for u < 0x20 or u >= 0x80; otherwise 0x20..0x7f passes (fast path)
+    {
+      if (u < 0x20)
+        {
+          if (!((1U << u) & allowed_ctl)) // Parenthesized: `!` binds tighter than `&`; shift is safe since u < 32
+            goto bad; // ASCII control chars except the allowed ones
+        }
+      else if (u >= 0xd800)
+        {
+          if (u < 0xf900) // 0xd800..0xf8ff Surrogates and private use characters
+            goto bad;
+          if (u >= limit)
+            goto bad; // Outside the range accepted by the caller
+        }
+      else if (u < 0xa0)
+        goto bad; // 0x80..0x9f Latin-1 control chars
+    }
+  return u;
+bad:
+  return UNI_REPLACEMENT;
+}
+
 /**
  * Basic sanity check on Unicode characters. Return `UNI_REPLACEMENT` if the input
  * character is a surrogate, ASCII or Latin-1 control character different from the tab,
@@ -288,12 +313,17 @@ static inline void *utf16_be_get(const void *p, uint *uu)
  **/
 static inline uint unicode_sanitize_char(uint u)
 {
-  if (u >= 0x10000 || // We don't accept anything outside the basic plane
-      u >= 0xd800 && u < 0xf900 || // neither we do surrogates and private use characters
-      u >= 0x80 && u < 0xa0 || // nor latin-1 control chars
-      u < 0x20 && u != '\t')
-    return UNI_REPLACEMENT;
-  return u;
+  return unicode_sanitize_char_internal(u, 0x10000, 1U<<'\t');
+}
+
+/**
+ * Sanity check covering the whole Unicode range. Return `UNI_REPLACEMENT` if the input
+ * is a surrogate, a private-use character (U+E000..U+F8FF), an ASCII or Latin-1 control
+ * character different from the tab, or above U+10FFFF. Otherwise, it acts as an identity.
+ **/
+static inline uint unicode_full_sanitize_char(uint u)
+{
+  return unicode_sanitize_char_internal(u, 0x110000, 1U<<'\t');
 }
 
 /* unicode-utf8.c */
-- 
2.39.2