return utf16_be_get_repl(p, uu, UNI_REPLACEMENT);
}
+// Internal helper for the sanitizing functions below.
+// Returns `u` unchanged when it is acceptable and `UNI_REPLACEMENT` otherwise.
+//   limit       - first value treated as outside the Unicode range (u >= limit is rejected)
+//   allowed_ctl - bit mask of permitted characters below 0x20 (bit N set => character N passes)
+// Characters 0x80..0x9f and 0xd800..0xf8ff are rejected regardless of the arguments.
+static inline uint unicode_sanitize_char_internal(uint u, uint limit, uint allowed_ctl)
+{
+  if (u - 0x20 >= 0x60)			// Otherwise printable ASCII 0x20..0x7f (optimized for this case)
+    {
+      if (u < 0x20)
+        {
+          // u < 32 here, so the shift is well-defined. The inner parentheses are
+          // required: `!` binds tighter than `&`.
+          if (!((1U << u) & allowed_ctl))
+            goto bad;			// ASCII control chars except the allowed ones
+        }
+      else if (u >= 0xd800)
+        {
+          if (u < 0xf900)		// 0xd800..0xf8ff Surrogates and private use characters
+            goto bad;
+          if (u >= limit)
+            goto bad;			// Outside the accepted Unicode range
+        }
+      else if (u < 0xa0)
+        goto bad;			// 0x80..0x9f Latin-1 control chars
+    }
+  return u;
+bad:
+  return UNI_REPLACEMENT;
+}
+
/**
* Basic sanity check on Unicode characters. Return `UNI_REPLACEMENT` if the input
* character is a surrogate, ASCII or Latin-1 control character different from the tab,
+ * a private-use character (up to 0xf8ff), or if it lies outside the basic plane
+ * (0x10000 and above). In all other cases, it acts as an identity.
**/
static inline uint unicode_sanitize_char(uint u)
{
- if (u >= 0x10000 || // We don't accept anything outside the basic plane
- u >= 0xd800 && u < 0xf900 || // neither we do surrogates and private use characters
- u >= 0x80 && u < 0xa0 || // nor latin-1 control chars
- u < 0x20 && u != '\t')
- return UNI_REPLACEMENT;
- return u;
+ return unicode_sanitize_char_internal(u, 0x10000, 1U<<'\t');
+}
+
+/**
+ * Sanity check covering the full Unicode range rather than just the basic plane:
+ * values of 0x110000 and above yield `UNI_REPLACEMENT`, as do surrogates and
+ * private-use characters in 0xd800..0xf8ff and Latin-1 control characters
+ * 0x80..0x9f. The tab is the only ASCII control character marked as permitted.
+ * Any other character is returned unchanged.
+ **/
+static inline uint unicode_full_sanitize_char(uint u)
+{
+  const uint range_end = 0x110000;	// First value past the accepted range
+  const uint tab_only = 1U << '\t';	// Mask of permitted control characters
+  return unicode_sanitize_char_internal(u, range_end, tab_only);
}
/* unicode-utf8.c */