From 4905e7908d5a7d37ac1b0d0ac18243b9ae6a381d Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Wed, 8 Jul 2015 23:58:21 +0200
Subject: [PATCH] Unicode: Reject denormalized UTF-8 sequences

When a single Unicode codepoint has multiple possible encodings, the
standard requires using the shortest one. Failing to check this
requirement on input has been observed to cause weird security
problems in some software, so we had better be careful and reject them.
---
 ucw/ff-unicode.c | 28 +++++++++++++++++---------
 ucw/ff-unicode.h |  2 +-
 ucw/ff-unicode.t | 52 +++++++++++++++++++++++++++++++++++++++---------
 ucw/unicode.h    | 22 +++++++++++++++-----
 ucw/unicode.t    | 37 +++++++++++++++++-----------------
 5 files changed, 99 insertions(+), 42 deletions(-)

diff --git a/ucw/ff-unicode.c b/ucw/ff-unicode.c
index 20d2cbdd..e0faa0c8 100644
--- a/ucw/ff-unicode.c
+++ b/ucw/ff-unicode.c
@@ -1,7 +1,7 @@
 /*
  *	UCW Library: Reading and writing of UTF-8 on Fastbuf Streams
  *
- *	(c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *	(c) 2001--2015 Martin Mares <mj@ucw.cz>
  *	(c) 2004 Robert Spalek <robert@ucw.cz>
  *
  *	This software may be freely distributed and used according to the terms
@@ -41,6 +41,8 @@ bget_utf8_slow(struct fastbuf *b, uint repl)
       if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
 	goto wrong;
       code = (code << 6) | (c & 0x3f);
+      if (code < 0x800)
+	goto wrong2;
     }
   else					/* 2 bytes */
     {
@@ -48,12 +50,15 @@ bget_utf8_slow(struct fastbuf *b, uint repl)
       if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
 	goto wrong;
       code = (code << 6) | (c & 0x3f);
+      if (code < 0x80)
+	goto wrong2;
     }
   return code;
 
- wrong:
+wrong:
   if (c >= 0)
     bungetc(b);
+wrong2:
   return repl;
 }
 
@@ -63,6 +68,7 @@ bget_utf8_32_slow(struct fastbuf *b, uint repl)
   int c = bgetc(b);
   int code;
   int nr;
+  int limit;
 
   if (c < 0x80)				/* Includes EOF */
     return c;
@@ -72,44 +78,48 @@ bget_utf8_32_slow(struct fastbuf *b, uint repl)
     {
       code = c & 0x1f;
       nr = 1;
+      limit = 0x80;
     }
   else if (c < 0xf0)
     {
       code = c & 0x0f;
       nr = 2;
+      limit = 0x800;
     }
   else if (c < 0xf8)
     {
       code = c & 0x07;
       nr = 3;
+      limit = 1 << 16;
     }
   else if (c < 0xfc)
     {
       code = c & 0x03;
       nr = 4;
+      limit = 1 << 21;
     }
   else if (c < 0xfe)
     {
       code = c & 0x01;
       nr = 5;
+      limit = 1 << 26;
     }
-  else					/* Too large, skip it */
-    {
-      while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
-	;
-      goto wrong;
-    }
+  else					/* Too large */
+    goto wrong2;
   while (nr-- > 0)
     {
       if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
 	goto wrong;
       code = (code << 6) | (c & 0x3f);
     }
+  if (code < limit)
+    goto wrong2;
   return code;
 
- wrong:
+wrong:
   if (c >= 0)
     bungetc(b);
+wrong2:
   return repl;
 }
 
diff --git a/ucw/ff-unicode.h b/ucw/ff-unicode.h
index 8341eb8b..79a0c493 100644
--- a/ucw/ff-unicode.h
+++ b/ucw/ff-unicode.h
@@ -1,7 +1,7 @@
 /*
  *	UCW Library: Reading and writing of UTF-8 and UTF-16 on Fastbuf Streams
  *
- *	(c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *	(c) 2001--2015 Martin Mares <mj@ucw.cz>
  *	(c) 2004 Robert Spalek <robert@ucw.cz>
  *	(c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
  *
diff --git a/ucw/ff-unicode.t b/ucw/ff-unicode.t
index ec089a66..82b8518c 100644
--- a/ucw/ff-unicode.t
+++ b/ucw/ff-unicode.t
@@ -1,14 +1,48 @@
 # Tests for the Unicode module
 
-Name:	bput_utf8
+Name:	bput_utf8 ASCII
 Run:	../obj/ucw/ff-unicode-t bput_utf8
 In:	0041 0048 004f 004a
 Out:	41 48 4f 4a
 
-Name:   bget_utf8_32
-Run:    ../obj/ucw/ff-unicode-t bget_utf8_32
-In:     fe 83 81
-Out:    fffc
+Name:	bput_utf8 BMP
+In:	00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
+Out:	c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
+
+Name:	bget_utf8 ASCII
+Run:	../obj/ucw/ff-unicode-t bget_utf8
+In:	41 48 4f 4a
+Out:	0041 0048 004f 004a
+
+Name:	bget_utf8 BMP
+In:	c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
+Out:	00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
+
+Name:	bget_utf8 garbage
+In:	84 ff f9 f8 c2 aa 41
+Out:	fffc fffc fffc fffc 00aa 0041
+
+Name:	bget_utf8 denormalized
+In:	c1 bf  e0 9f bf
+Out:	fffc fffc
+
+Name:	bput_utf8_32
+Run:	../obj/ucw/ff-unicode-t bput_utf8_32
+In:	15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
+Out:	f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
+
+Name:	bget_utf8_32
+Run:	../obj/ucw/ff-unicode-t bget_utf8_32
+In:	f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
+Out:	15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
+
+Name:	bget_utf8_32 garbage
+In:	fe 83 81
+Out:	fffc fffc fffc
+
+Name:	bget_utf8_32 denormalized
+In:	c1 bf  e0 9f bf  f0 8f bf bf  f8 87 bf bf bf  fc 83 bf bf bf
+Out:	fffc fffc fffc fffc fffc
 
 Name:   bput_utf16_be
 Run:    ../obj/ucw/ff-unicode-t bput_utf16_be
@@ -20,22 +54,22 @@ Run:    ../obj/ucw/ff-unicode-t bput_utf16_le
 In:     0041 004a 2a5f feff 0000 10ffff ffff 10000
 Out:    41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
 
-Name:   bget_utf16_be (1)
+Name:   bget_utf16_be
 Run:    ../obj/ucw/ff-unicode-t bget_utf16_be
 In:     00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
 Out:    0041 004a 2a5f feff 0000 10ffff ffff 10000
 
-Name:   bget_utf16_be (2)
+Name:   bget_utf16_be bad surrogates
 Run:    ../obj/ucw/ff-unicode-t bget_utf16_be
 In:     dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01
 Out:    fffc 2a5f fffc 2a5f fffc
 
-Name:   bget_utf16_le (1)
+Name:   bget_utf16_le
 Run:    ../obj/ucw/ff-unicode-t bget_utf16_le
 In:     41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
 Out:    0041 004a 2a5f feff 0000 10ffff ffff 10000
 
-Name:   bget_utf16_le (2)
+Name:   bget_utf16_le bad surrogates
 Run:    ../obj/ucw/ff-unicode-t bget_utf16_le
 In:     1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8
 Out:    fffc 2a5f fffc 2a5f fffc
diff --git a/ucw/unicode.h b/ucw/unicode.h
index a71b4baa..dd3d35b0 100644
--- a/ucw/unicode.h
+++ b/ucw/unicode.h
@@ -89,6 +89,7 @@ put1: *p++ = 0x80 | (u & 0x3f);
 }
 
 #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
+#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
 
 /**
  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
@@ -109,12 +110,14 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
     {
       u &= 0x1f;
       UTF8_GET_NEXT;
+      UTF8_CHECK_RANGE(0x80);
     }
   else if (likely(u < 0xf0))
     {
       u &= 0x0f;
       UTF8_GET_NEXT;
       UTF8_GET_NEXT;
+      UTF8_CHECK_RANGE(0x800);
     }
   else
     goto bad;
@@ -129,47 +132,56 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
 static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl)
 {
   uint u = *p++;
+  uint limit;
   if (u < 0x80)
     ;
   else if (unlikely(u < 0xc0))
-    {
-      /* Incorrect byte sequence */
-    bad:
-      u = repl;
-    }
+    goto bad;
   else if (u < 0xe0)
     {
       u &= 0x1f;
+      limit = 0x80;
       goto get1;
     }
   else if (u < 0xf0)
     {
       u &= 0x0f;
+      limit = 0x800;
       goto get2;
     }
   else if (u < 0xf8)
     {
       u &= 0x07;
+      limit = 1 << 16;
       goto get3;
     }
   else if (u < 0xfc)
     {
       u &= 0x03;
+      limit = 1 << 21;
       goto get4;
     }
   else if (u < 0xfe)
     {
       u &= 0x01;
+      limit = 1 << 26;
       UTF8_GET_NEXT;
 get4: UTF8_GET_NEXT;
 get3: UTF8_GET_NEXT;
 get2: UTF8_GET_NEXT;
 get1: UTF8_GET_NEXT;
+      if (unlikely(u < limit))
+	goto bad;
     }
   else
     goto bad;
   *uu = u;
   return (byte *)p;
+
+bad:
+  /* Incorrect byte sequence */
+  *uu = repl;
+  return (byte *)p;
 }
 
 /**
diff --git a/ucw/unicode.t b/ucw/unicode.t
index 94e55f29..f239f32a 100644
--- a/ucw/unicode.t
+++ b/ucw/unicode.t
@@ -1,71 +1,72 @@
 # Tests for the Unicode module
 
-Name:	utf8_put (1)
+Name:	utf8_put ASCII
 Run:	../obj/ucw/unicode-t utf8_put
 In:	0041 0048 004f 004a
 Out:	41 48 4f 4a
 
-Name:	utf8_put (2)
-Run:	../obj/ucw/unicode-t utf8_put
+Name:	utf8_put BMP
 In:	00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
 Out:	c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
 
-Name:	utf8_get (1)
+Name:	utf8_get ASCII
 Run:	../obj/ucw/unicode-t utf8_get
 In:	41 48 4f 4a
 Out:	0041 0048 004f 004a
 
-Name:	utf8_get (2)
-Run:	../obj/ucw/unicode-t utf8_get
+Name:	utf8_get BMP
 In:	c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
 Out:	00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
 
-Name:	utf8_get (3)
-Run:	../obj/ucw/unicode-t utf8_get
+Name:	utf8_get garbage
 In:	84 ff f9 f8 c2 aa 41
 Out:	fffc fffc fffc fffc 00aa 0041
 
+Name:	utf8_get denormalized
+In:	c1 bf  e0 9f bf
+Out:	fffc fffc
+
 Name:	utf8_32_put
 Run:	../obj/ucw/unicode-t utf8_32_put
 In:	15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
 Out:	f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
 
-Name:	utf8_32_get (1)
+Name:	utf8_32_get
 Run:	../obj/ucw/unicode-t utf8_32_get
 In:	f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
 Out:	15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
 
-Name:	utf8_32_get (2)
-Run:	../obj/ucw/unicode-t utf8_32_get
+Name:	utf8_32_get garbage
 In:	fe 83 81
 Out:	fffc fffc fffc
 
+Name:	utf8_32_get denormalized
+In:	c1 bf  e0 9f bf  f0 8f bf bf  f8 87 bf bf bf  fc 83 bf bf bf
+Out:	fffc fffc fffc fffc fffc
+
 Name:	utf16_be_put
 Run:	../obj/ucw/unicode-t utf16_be_put
 In:	0041 004a 2a5f feff 0000 10ffff ffff 10000
 Out:	00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
 
 Name:	utf16_le_put
-Run:	../obj/ucw/unicode-t utf16_le_put
 In:	0041 004a 2a5f feff 0000 10ffff ffff 10000
 Out:	41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
 
-Name:	utf16_be_get (1)
+Name:	utf16_be_get
 Run:	../obj/ucw/unicode-t utf16_be_get
 In:	00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
 Out:	0041 004a 2a5f feff 0000 10ffff ffff 10000
 
-Name:	utf16_be_get (2)
-Run:	../obj/ucw/unicode-t utf16_be_get
+Name:	utf16_be_get bad surrogates
 In:	dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01
 Out:	fffc 2a5f fffc fffc 2a5f fffc
 
-Name:	utf16_le_get (1)
+Name:	utf16_le_get
 Run:	../obj/ucw/unicode-t utf16_le_get
 In:	41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
 Out:	0041 004a 2a5f feff 0000 10ffff ffff 10000
 
-Name:	utf16_le_get (2)
-Run:	../obj/ucw/unicode-t utf16_le_get
+Name:	utf16_le_get bad surrogates
 In:	1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8
 Out:	fffc 2a5f fffc fffc 2a5f fffc
-- 
2.39.2