From 426d83a86afa628ad25dd3c0ba79ac407696f7c5 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Fri, 29 Dec 2017 14:45:26 +0100 Subject: [PATCH] UTF-8: Slightly changed sanitization. The disassembled code from my compiler seems to be OK now. --- ucw/unicode.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/ucw/unicode.h b/ucw/unicode.h index 8966167c..d40e8105 100644 --- a/ucw/unicode.h +++ b/ucw/unicode.h @@ -282,29 +282,26 @@ static inline void *utf16_be_get(const void *p, uint *uu) } // Internal helper for functions below -static inline uint unicode_sanitize_char_internal(uint u, uint limit, uint allowed_ctl, uint repl) +static inline bool unicode_check_char_internal(uint u, uint limit, uint allowed_ctl) { if (u - 0x20 >= 0x60) // Otherwise printable ASCII 0x20..0x7f (optimized for this case) { if (u < 0x20) { - if (!(1U << u) & allowed_ctl) - goto bad; // Latin-1 control chars except the allowed ones + if (!((1U << u) & allowed_ctl)) + return false; // Latin-1 control chars except the allowed ones } else if (u >= 0xd800) { if (u < 0xf900) // 0xd800..0xf8ff Surrogate pair - goto bad; - if (u >= limit) - goto bad; // Outside Unicode range + return false; + if (u > limit) + return false; // Outside allowed range } else if (u < 0xa0) - { // 0x80..0x9f Latin-1 control chars -bad: - u = repl; - } + return false; // 0x80..0x9f Latin-1 control chars } - return u; + return true; } /** @@ -314,7 +311,9 @@ bad: **/ static inline uint unicode_sanitize_char(uint u) { - return unicode_sanitize_char_internal(u, 0x10000, 1U<<'\t', UNI_REPLACEMENT); + if (!unicode_check_char_internal(u, 0xffff, 1U << '\t')) + u = UNI_REPLACEMENT; + return u; } /** @@ -324,7 +323,9 @@ static inline uint unicode_sanitize_char(uint u) **/ static inline uint unicode_full_sanitize_char(uint u) { - return unicode_sanitize_char_internal(u, 0x110000, 1U<<'\t', UNI_REPLACEMENT); + if (!unicode_check_char_internal(u, 0x10ffff, 1U << '\t')) + u = UNI_REPLACEMENT; + return u; } /* unicode-utf8.c */ -- 2.39.2