Unicode: Reject denormalized UTF-8 sequences

author Martin Mares <mj@ucw.cz>
Wed, 8 Jul 2015 21:58:21 +0000 (23:58 +0200)
committer Martin Mares <mj@ucw.cz>
Wed, 8 Jul 2015 21:58:21 +0000 (23:58 +0200)
diff --git a/ucw/ff-unicode.c b/ucw/ff-unicode.c

index 20d2cbddcff3c8da020a02e02a1387683f7a9478..e0faa0c85ae4aeb31a917bc87f34b24844522761 100644 (file)
--- a/ucw/ff-unicode.c
+++ b/ucw/ff-unicode.c
@@ -1,7 +1,7 @@
  /*
   *     UCW Library: Reading and writing of UTF-8 on Fastbuf Streams
   *
- *     (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *     (c) 2001--2015 Martin Mares <mj@ucw.cz>
   *     (c) 2004 Robert Spalek <robert@ucw.cz>
   *
   *     This software may be freely distributed and used according to the terms
@@ -41,6 +41,8 @@ bget_utf8_slow(struct fastbuf *b, uint repl)
        if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
         goto wrong;
        code = (code << 6) | (c & 0x3f);
+      if (code < 0x800)
+       goto wrong2;
      }
    else                                 /* 2 bytes */
      {
@@ -48,12 +50,15 @@ bget_utf8_slow(struct fastbuf *b, uint repl)
        if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
         goto wrong;
        code = (code << 6) | (c & 0x3f);
+      if (code < 0x80)
+       goto wrong2;
      }
    return code;
  
- wrong:
+wrong:
    if (c >= 0)
      bungetc(b);
+wrong2:
    return repl;
  }
  
@@ -63,6 +68,7 @@ bget_utf8_32_slow(struct fastbuf *b, uint repl)
    int c = bgetc(b);
    int code;
    int nr;
+  int limit;
  
    if (c < 0x80)                                /* Includes EOF */
      return c;
@@ -72,44 +78,48 @@ bget_utf8_32_slow(struct fastbuf *b, uint repl)
      {
        code = c & 0x1f;
        nr = 1;
+      limit = 0x80;
      }
    else if (c < 0xf0)
      {
        code = c & 0x0f;
        nr = 2;
+      limit = 0x800;
      }
    else if (c < 0xf8)
      {
        code = c & 0x07;
        nr = 3;
+      limit = 1 << 16;
      }
    else if (c < 0xfc)
      {
        code = c & 0x03;
        nr = 4;
+      limit = 1 << 21;
      }
    else if (c < 0xfe)
      {
        code = c & 0x01;
        nr = 5;
+      limit = 1 << 26;
      }
-  else                                 /* Too large, skip it */
-    {
-      while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
-       ;
-      goto wrong;
-    }
+  else                                 /* Too large */
+    goto wrong2;
    while (nr-- > 0)
      {
        if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
         goto wrong;
        code = (code << 6) | (c & 0x3f);
      }
+  if (code < limit)
+    goto wrong2;
    return code;
  
- wrong:
+wrong:
    if (c >= 0)
      bungetc(b);
+wrong2:
    return repl;
  }
  
diff --git a/ucw/ff-unicode.h b/ucw/ff-unicode.h

index 8341eb8b42a26d12f4814cc852a2ae2e15b82ff6..79a0c4935be88e1b8b7f976f3ebffd6dec7c9627 100644 (file)
--- a/ucw/ff-unicode.h
+++ b/ucw/ff-unicode.h
@@ -1,7 +1,7 @@
  /*
   *     UCW Library: Reading and writing of UTF-8 and UTF-16 on Fastbuf Streams
   *
- *     (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *     (c) 2001--2015 Martin Mares <mj@ucw.cz>
   *     (c) 2004 Robert Spalek <robert@ucw.cz>
   *     (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
   *
diff --git a/ucw/ff-unicode.t b/ucw/ff-unicode.t

index ec089a66046bd4a59e25039752b0d0a1a5f5675a..82b8518c3ea5526d0ec8e5d05e2e7ac28f0df1fe 100644 (file)
--- a/ucw/ff-unicode.t
+++ b/ucw/ff-unicode.t
@@ -1,14 +1,48 @@
  # Tests for the Unicode module
  
-Name:  bput_utf8
+Name:  bput_utf8 ASCII
  Run:   ../obj/ucw/ff-unicode-t bput_utf8
  In:    0041 0048 004f 004a
  Out:   41 48 4f 4a
  
-Name:   bget_utf8_32
-Run:    ../obj/ucw/ff-unicode-t bget_utf8_32
-In:     fe 83 81
-Out:    fffc
+Name:  bput_utf8 BMP
+In:    00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
+Out:   c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
+
+Name:  bget_utf8 ASCII
+Run:   ../obj/ucw/ff-unicode-t bget_utf8
+In:    41 48 4f 4a
+Out:   0041 0048 004f 004a
+
+Name:  bget_utf8 BMP
+In:    c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
+Out:   00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
+
+Name:  bget_utf8 garbage
+In:    84 ff f9 f8 c2 aa 41
+Out:   fffc fffc fffc fffc 00aa 0041
+
+Name:  bget_utf8 denormalized
+In:    c1 bf  e0 9f bf
+Out:   fffc fffc
+
+Name:  bput_utf8_32
+Run:   ../obj/ucw/ff-unicode-t bput_utf8_32
+In:    15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
+Out:   f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
+
+Name:  bget_utf8_32
+Run:   ../obj/ucw/ff-unicode-t bget_utf8_32
+In:    f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
+Out:   15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
+
+Name:  bget_utf8_32 garbage
+In:    fe 83 81
+Out:   fffc fffc fffc
+
+Name:  bget_utf8_32 denormalized
+In:    c1 bf  e0 9f bf  f0 8f bf bf  f8 87 bf bf bf  fc 83 bf bf bf
+Out:   fffc fffc fffc fffc fffc
  
  Name:   bput_utf16_be
  Run:    ../obj/ucw/ff-unicode-t bput_utf16_be
@@ -20,22 +54,22 @@ Run:    ../obj/ucw/ff-unicode-t bput_utf16_le
  In:     0041 004a 2a5f feff 0000 10ffff ffff 10000
  Out:    41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
  
-Name:   bget_utf16_be (1)
+Name:   bget_utf16_be
  Run:    ../obj/ucw/ff-unicode-t bget_utf16_be
  In:     00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
  Out:    0041 004a 2a5f feff 0000 10ffff ffff 10000
  
-Name:   bget_utf16_be (2)
+Name:   bget_utf16_be bad surrogates
  Run:    ../obj/ucw/ff-unicode-t bget_utf16_be
  In:     dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01
  Out:    fffc 2a5f fffc 2a5f fffc
  
-Name:   bget_utf16_le (1)
+Name:   bget_utf16_le
  Run:    ../obj/ucw/ff-unicode-t bget_utf16_le
  In:     41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
  Out:    0041 004a 2a5f feff 0000 10ffff ffff 10000
  
-Name:   bget_utf16_le (2)
+Name:   bget_utf16_le bad surrogates
  Run:    ../obj/ucw/ff-unicode-t bget_utf16_le
  In:     1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8
  Out:    fffc 2a5f fffc 2a5f fffc
diff --git a/ucw/unicode.h b/ucw/unicode.h

index a71b4baac20a605ee21f5d676af836de5e53d6a2..dd3d35b0a52e88705372f28348c58c790aea3269 100644 (file)
--- a/ucw/unicode.h
+++ b/ucw/unicode.h
@@ -89,6 +89,7 @@ put1: *p++ = 0x80 | (u & 0x3f);
  }
  
  #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
+#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
  
  /**
   * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
@@ -109,12 +110,14 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
      {
        u &= 0x1f;
        UTF8_GET_NEXT;
+      UTF8_CHECK_RANGE(0x80);
      }
    else if (likely(u < 0xf0))
      {
        u &= 0x0f;
        UTF8_GET_NEXT;
        UTF8_GET_NEXT;
+      UTF8_CHECK_RANGE(0x800);
      }
    else
      goto bad;
@@ -129,47 +132,56 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
  static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl)
  {
    uint u = *p++;
+  uint limit;
    if (u < 0x80)
      ;
    else if (unlikely(u < 0xc0))
-    {
-      /* Incorrect byte sequence */
-    bad:
-      u = repl;
-    }
+    goto bad;
    else if (u < 0xe0)
      {
        u &= 0x1f;
+      limit = 0x80;
        goto get1;
      }
    else if (u < 0xf0)
      {
        u &= 0x0f;
+      limit = 0x800;
        goto get2;
      }
    else if (u < 0xf8)
      {
        u &= 0x07;
+      limit = 1 << 16;
        goto get3;
      }
    else if (u < 0xfc)
      {
        u &= 0x03;
+      limit = 1 << 21;
        goto get4;
      }
    else if (u < 0xfe)
      {
        u &= 0x01;
+      limit = 1 << 26;
        UTF8_GET_NEXT;
  get4: UTF8_GET_NEXT;
  get3: UTF8_GET_NEXT;
  get2: UTF8_GET_NEXT;
  get1: UTF8_GET_NEXT;
+      if (unlikely(u < limit))
+       goto bad;
      }
    else
      goto bad;
    *uu = u;
    return (byte *)p;
+
+bad:
+  /* Incorrect byte sequence */
+  *uu = repl;
+  return (byte *)p;
  }
  
  /**
diff --git a/ucw/unicode.t b/ucw/unicode.t

index 94e55f29a132970cb8d3be41898bc143c6c0a046..f239f32ac5f0761c40c7cf4ce66fcc19b1caac12 100644 (file)
--- a/ucw/unicode.t
+++ b/ucw/unicode.t
@@ -1,71 +1,72 @@
  # Tests for the Unicode module
  
-Name:  utf8_put (1)
+Name:  utf8_put ASCII
  Run:   ../obj/ucw/unicode-t utf8_put
  In:    0041 0048 004f 004a
  Out:   41 48 4f 4a
  
-Name:  utf8_put (2)
-Run:   ../obj/ucw/unicode-t utf8_put
+Name:  utf8_put BMP
  In:    00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
  Out:   c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
  
-Name:  utf8_get (1)
+Name:  utf8_get ASCII
  Run:   ../obj/ucw/unicode-t utf8_get
  In:    41 48 4f 4a
  Out:   0041 0048 004f 004a
  
-Name:  utf8_get (2)
-Run:   ../obj/ucw/unicode-t utf8_get
+Name:  utf8_get BMP
  In:    c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
  Out:   00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
  
-Name:  utf8_get (3)
-Run:   ../obj/ucw/unicode-t utf8_get
+Name:  utf8_get garbage
  In:    84 ff f9 f8 c2 aa 41
  Out:   fffc fffc fffc fffc 00aa 0041
  
+Name:  utf8_get denormalized
+In:    c1 bf  e0 9f bf
+Out:   fffc fffc
+
  Name:  utf8_32_put
  Run:   ../obj/ucw/unicode-t utf8_32_put
  In:    15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
  Out:   f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
  
-Name:  utf8_32_get (1)
+Name:  utf8_32_get
  Run:   ../obj/ucw/unicode-t utf8_32_get
  In:    f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
  Out:   15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
  
-Name:  utf8_32_get (2)
-Run:   ../obj/ucw/unicode-t utf8_32_get
+Name:  utf8_32_get garbage
  In:    fe 83 81
  Out:   fffc fffc fffc
  
+Name:  utf8_32_get denormalized
+In:    c1 bf  e0 9f bf  f0 8f bf bf  f8 87 bf bf bf  fc 83 bf bf bf
+Out:   fffc fffc fffc fffc fffc
+
  Name:  utf16_be_put
  Run:   ../obj/ucw/unicode-t utf16_be_put
  In:    0041 004a 2a5f feff 0000 10ffff ffff 10000
  Out:   00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
  
  Name:  utf16_le_put
-Run:   ../obj/ucw/unicode-t utf16_le_put
  In:    0041 004a 2a5f feff 0000 10ffff ffff 10000
  Out:   41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
  
-Name:  utf16_be_get (1)
+Name:  utf16_be_get
  Run:   ../obj/ucw/unicode-t utf16_be_get
  In:    00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
  Out:   0041 004a 2a5f feff 0000 10ffff ffff 10000
  
-Name:  utf16_be_get (2)
-Run:   ../obj/ucw/unicode-t utf16_be_get
+Name:  utf16_be_get bad surrogates
  In:    dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01
  Out:   fffc 2a5f fffc fffc 2a5f fffc
  
-Name:  utf16_le_get (1)
+Name:  utf16_le_get
  Run:   ../obj/ucw/unicode-t utf16_le_get
  In:    41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
  Out:   0041 004a 2a5f feff 0000 10ffff ffff 10000
  
-Name:  utf16_le_get (2)
-Run:   ../obj/ucw/unicode-t utf16_le_get
+Name:  utf16_le_get bad surrogates
  In:    1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8
  Out:   fffc 2a5f fffc fffc 2a5f fffc
author	Martin Mares <mj@ucw.cz>
	Wed, 8 Jul 2015 21:58:21 +0000 (23:58 +0200)
committer	Martin Mares <mj@ucw.cz>
	Wed, 8 Jul 2015 21:58:21 +0000 (23:58 +0200)
ucw/ff-unicode.c		patch | blob | history
ucw/ff-unicode.h		patch | blob | history
ucw/ff-unicode.t		patch | blob | history
ucw/unicode.h		patch | blob | history
ucw/unicode.t		patch | blob | history