UTF-8: Added unicode_full_sanitize_char()

author Pavel Charvat <pchar@ucw.cz>

Fri, 29 Dec 2017 13:09:06 +0000 (14:09 +0100)

committer Pavel Charvat <pchar@ucw.cz>

Fri, 29 Dec 2017 13:09:06 +0000 (14:09 +0100)
author Pavel Charvat <pchar@ucw.cz>
Fri, 29 Dec 2017 13:09:06 +0000 (14:09 +0100)
committer Pavel Charvat <pchar@ucw.cz>
Fri, 29 Dec 2017 13:09:06 +0000 (14:09 +0100)
diff --git a/ucw/unicode.h b/ucw/unicode.h

index e6ad42ab97ef173a5065c80e9b36a7fa852e96a3..d84f5038c72c52c09058258c8a81f8d3c6892ba8 100644 (file)
--- a/ucw/unicode.h
+++ b/ucw/unicode.h
@@ -281,6 +281,31 @@ static inline void *utf16_be_get(const void *p, uint *uu)
    return utf16_be_get_repl(p, uu, UNI_REPLACEMENT);
  }
  
+// Internal helper for the sanitizers below: returns `u` unchanged, or `UNI_REPLACEMENT` if `u`
+// is a control char 0x00..0x1f whose bit (1U << u) is not set in `allowed_ctl`, a Latin-1
+// control char 0x80..0x9f, a value in 0xd800..0xf8ff, or a value >= max(`limit`, 0xd800).
+static inline uint unicode_sanitize_char_internal(uint u, uint limit, uint allowed_ctl)
+{
+  if (u - 0x20 >= 0x60)                        // Otherwise 0x20..0x7f, accepted as is (optimized for this case)
+    {
+      if (u < 0x20)
+       {
+         if (!((1U << u) & allowed_ctl))       // Parenthesized: `!` binds tighter than `&`
+           goto bad;                   // ASCII control chars except the allowed ones
+       }
+      else if (u >= 0xd800)
+       {
+         if (u < 0xf900 || u >= limit) // 0xd800..0xf8ff surrogates and private use, or outside the accepted range
+           goto bad;
+       }
+      else if (u < 0xa0)
+       goto bad;                       // 0x80..0x9f Latin-1 control chars
+    }
+  return u;
+bad:
+  return UNI_REPLACEMENT;
+}
+
  /**
   * Basic sanity check on Unicode characters. Return `UNI_REPLACEMENT` if the input
   * character is a surrogate, ASCII or Latin-1 control character different from the tab,
@@ -288,12 +313,17 @@ static inline void *utf16_be_get(const void *p, uint *uu)
   **/
  static inline uint unicode_sanitize_char(uint u)
  {
-  if (u >= 0x10000 ||                  // We don't accept anything outside the basic plane
-      u >= 0xd800 && u < 0xf900 ||     // neither we do surrogates and private use characters
-      u >= 0x80 && u < 0xa0 ||         // nor latin-1 control chars
-      u < 0x20 && u != '\t')
-    return UNI_REPLACEMENT;
-  return u;
+  return unicode_sanitize_char_internal(u, 0x10000, 1U<<'\t');	// limit 0x10000 = basic plane only; the mask has just the tab bit set
+}
+
+/**
+ * Sanity check on characters from the whole Unicode range. Return `UNI_REPLACEMENT` if the
+ * input is in 0xd800..0xf8ff (surrogates, private use), is 0x110000 or above, or is a control
+ * character (0x00..0x1f, 0x80..0x9f) different from the tab. Otherwise, it acts as an identity.
+ **/
+static inline uint unicode_full_sanitize_char(uint u)
+{
+  return unicode_sanitize_char_internal(u, 0x110000, 1U<<'\t');	// limit 0x110000 is one past U+10FFFF; the mask has just the tab bit set
  }
  
  /* unicode-utf8.c */
author	Pavel Charvat <pchar@ucw.cz>
	Fri, 29 Dec 2017 13:09:06 +0000 (14:09 +0100)
committer	Pavel Charvat <pchar@ucw.cz>
	Fri, 29 Dec 2017 13:09:06 +0000 (14:09 +0100)