From 4905e7908d5a7d37ac1b0d0ac18243b9ae6a381d Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Wed, 8 Jul 2015 23:58:21 +0200 Subject: [PATCH] Unicode: Reject denormalized UTF-8 sequences When there are multiple possible encodings of a single Unicode codepoint, the standard requires to use the shortest one. Not checking this requirement on input has been observed to cause weird security problems in some software, so better be careful. --- ucw/ff-unicode.c | 28 +++++++++++++++++--------- ucw/ff-unicode.h | 2 +- ucw/ff-unicode.t | 52 +++++++++++++++++++++++++++++++++++++++--------- ucw/unicode.h | 22 +++++++++++++++----- ucw/unicode.t | 37 +++++++++++++++++----------------- 5 files changed, 99 insertions(+), 42 deletions(-) diff --git a/ucw/ff-unicode.c b/ucw/ff-unicode.c index 20d2cbdd..e0faa0c8 100644 --- a/ucw/ff-unicode.c +++ b/ucw/ff-unicode.c @@ -1,7 +1,7 @@ /* * UCW Library: Reading and writing of UTF-8 on Fastbuf Streams * - * (c) 2001--2004 Martin Mares + * (c) 2001--2015 Martin Mares * (c) 2004 Robert Spalek * * This software may be freely distributed and used according to the terms @@ -41,6 +41,8 @@ bget_utf8_slow(struct fastbuf *b, uint repl) if ((c = bgetc(b)) < 0x80 || c >= 0xc0) goto wrong; code = (code << 6) | (c & 0x3f); + if (code < 0x800) + goto wrong2; } else /* 2 bytes */ { @@ -48,12 +50,15 @@ bget_utf8_slow(struct fastbuf *b, uint repl) if ((c = bgetc(b)) < 0x80 || c >= 0xc0) goto wrong; code = (code << 6) | (c & 0x3f); + if (code < 0x80) + goto wrong2; } return code; - wrong: +wrong: if (c >= 0) bungetc(b); +wrong2: return repl; } @@ -63,6 +68,7 @@ bget_utf8_32_slow(struct fastbuf *b, uint repl) int c = bgetc(b); int code; int nr; + int limit; if (c < 0x80) /* Includes EOF */ return c; @@ -72,44 +78,48 @@ bget_utf8_32_slow(struct fastbuf *b, uint repl) { code = c & 0x1f; nr = 1; + limit = 0x80; } else if (c < 0xf0) { code = c & 0x0f; nr = 2; + limit = 0x800; } else if (c < 0xf8) { code = c & 0x07; nr = 3; + limit = 1 << 16; } else if (c < 
0xfc) { code = c & 0x03; nr = 4; + limit = 1 << 21; } else if (c < 0xfe) { code = c & 0x01; nr = 5; + limit = 1 << 26; } - else /* Too large, skip it */ - { - while ((c = bgetc(b)) >= 0x80 && c < 0xc0) - ; - goto wrong; - } + else /* Too large */ + goto wrong2; while (nr-- > 0) { if ((c = bgetc(b)) < 0x80 || c >= 0xc0) goto wrong; code = (code << 6) | (c & 0x3f); } + if (code < limit) + goto wrong2; return code; - wrong: +wrong: if (c >= 0) bungetc(b); +wrong2: return repl; } diff --git a/ucw/ff-unicode.h b/ucw/ff-unicode.h index 8341eb8b..79a0c493 100644 --- a/ucw/ff-unicode.h +++ b/ucw/ff-unicode.h @@ -1,7 +1,7 @@ /* * UCW Library: Reading and writing of UTF-8 and UTF-16 on Fastbuf Streams * - * (c) 2001--2004 Martin Mares + * (c) 2001--2015 Martin Mares * (c) 2004 Robert Spalek * (c) 2007--2008 Pavel Charvat * diff --git a/ucw/ff-unicode.t b/ucw/ff-unicode.t index ec089a66..82b8518c 100644 --- a/ucw/ff-unicode.t +++ b/ucw/ff-unicode.t @@ -1,14 +1,48 @@ # Tests for the Unicode module -Name: bput_utf8 +Name: bput_utf8 ASCII Run: ../obj/ucw/ff-unicode-t bput_utf8 In: 0041 0048 004f 004a Out: 41 48 4f 4a -Name: bget_utf8_32 -Run: ../obj/ucw/ff-unicode-t bget_utf8_32 -In: fe 83 81 -Out: fffc +Name: bput_utf8 BMP +In: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5 +Out: c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5 + +Name: bget_utf8 ASCII +Run: ../obj/ucw/ff-unicode-t bget_utf8 +In: 41 48 4f 4a +Out: 0041 0048 004f 004a + +Name: bget_utf8 BMP +In: c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5 +Out: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5 + +Name: bget_utf8 garbage +In: 84 ff f9 f8 c2 aa 41 +Out: fffc fffc fffc fffc 00aa 0041 + +Name: bget_utf8 denormalized +In: c1 bf e0 9f bf +Out: fffc fffc + +Name: bput_utf8_32 +Run: ../obj/ucw/ff-unicode-t bput_utf8_32 +In: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a +Out: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 
9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a + +Name: bget_utf8_32 +Run: ../obj/ucw/ff-unicode-t bget_utf8_32 +In: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a +Out: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a + +Name: bget_utf8_32 garbage +In: fe 83 81 +Out: fffc fffc fffc + +Name: bget_utf8_32 denormalized +In: c1 bf e0 9f bf f0 8f bf bf f8 87 bf bf bf fc 83 bf bf bf +Out: fffc fffc fffc fffc fffc Name: bput_utf16_be Run: ../obj/ucw/ff-unicode-t bput_utf16_be @@ -20,22 +54,22 @@ Run: ../obj/ucw/ff-unicode-t bput_utf16_le In: 0041 004a 2a5f feff 0000 10ffff ffff 10000 Out: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc -Name: bget_utf16_be (1) +Name: bget_utf16_be Run: ../obj/ucw/ff-unicode-t bget_utf16_be In: 00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00 Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000 -Name: bget_utf16_be (2) +Name: bget_utf16_be bad surrogates Run: ../obj/ucw/ff-unicode-t bget_utf16_be In: dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01 Out: fffc 2a5f fffc 2a5f fffc -Name: bget_utf16_le (1) +Name: bget_utf16_le Run: ../obj/ucw/ff-unicode-t bget_utf16_le In: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000 -Name: bget_utf16_le (2) +Name: bget_utf16_le bad surrogates Run: ../obj/ucw/ff-unicode-t bget_utf16_le In: 1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8 Out: fffc 2a5f fffc 2a5f fffc diff --git a/ucw/unicode.h b/ucw/unicode.h index a71b4baa..dd3d35b0 100644 --- a/ucw/unicode.h +++ b/ucw/unicode.h @@ -89,6 +89,7 @@ put1: *p++ = 0x80 | (u & 0x3f); } #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) 
goto bad; u = (u << 6) | (*p++ & 0x3f) +#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad /** * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane) @@ -109,12 +110,14 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl) { u &= 0x1f; UTF8_GET_NEXT; + UTF8_CHECK_RANGE(0x80); } else if (likely(u < 0xf0)) { u &= 0x0f; UTF8_GET_NEXT; UTF8_GET_NEXT; + UTF8_CHECK_RANGE(0x800); } else goto bad; @@ -129,47 +132,56 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl) static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl) { uint u = *p++; + uint limit; if (u < 0x80) ; else if (unlikely(u < 0xc0)) - { - /* Incorrect byte sequence */ - bad: - u = repl; - } + goto bad; else if (u < 0xe0) { u &= 0x1f; + limit = 0x80; goto get1; } else if (u < 0xf0) { u &= 0x0f; + limit = 0x800; goto get2; } else if (u < 0xf8) { u &= 0x07; + limit = 1 << 16; goto get3; } else if (u < 0xfc) { u &= 0x03; + limit = 1 << 21; goto get4; } else if (u < 0xfe) { u &= 0x01; + limit = 1 << 26; UTF8_GET_NEXT; get4: UTF8_GET_NEXT; get3: UTF8_GET_NEXT; get2: UTF8_GET_NEXT; get1: UTF8_GET_NEXT; + if (unlikely(u < limit)) + goto bad; } else goto bad; *uu = u; return (byte *)p; + +bad: + /* Incorrect byte sequence */ + *uu = repl; + return (byte *)p; } /** diff --git a/ucw/unicode.t b/ucw/unicode.t index 94e55f29..f239f32a 100644 --- a/ucw/unicode.t +++ b/ucw/unicode.t @@ -1,71 +1,73 @@ # Tests for the Unicode module -Name: utf8_put (1) +Name: utf8_put ASCII Run: ../obj/ucw/unicode-t utf8_put In: 0041 0048 004f 004a Out: 41 48 4f 4a -Name: utf8_put (2) -Run: ../obj/ucw/unicode-t utf8_put +Name: utf8_put BMP In: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5 Out: c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5 -Name: utf8_get (1) +Name: utf8_get ASCII Run: ../obj/ucw/unicode-t utf8_get In: 41 48 4f 4a Out: 0041 0048 004f 004a -Name: utf8_get (2) -Run: ../obj/ucw/unicode-t utf8_get +Name: utf8_get BMP In: c2 aa
c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5 Out: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5 -Name: utf8_get (3) -Run: ../obj/ucw/unicode-t utf8_get +Name: utf8_get garbage In: 84 ff f9 f8 c2 aa 41 Out: fffc fffc fffc fffc 00aa 0041 +Name: utf8_get denormalized +In: c1 bf e0 9f bf +Out: fffc fffc + Name: utf8_32_put Run: ../obj/ucw/unicode-t utf8_32_put In: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a Out: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a -Name: utf8_32_get (1) +Name: utf8_32_get Run: ../obj/ucw/unicode-t utf8_32_get In: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a Out: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a -Name: utf8_32_get (2) -Run: ../obj/ucw/unicode-t utf8_32_get +Name: utf8_32_get garbage In: fe 83 81 Out: fffc fffc fffc +Name: utf8_32_get denormalized +In: c1 bf e0 9f bf f0 8f bf bf f8 87 bf bf bf fc 83 bf bf bf +Out: fffc fffc fffc fffc fffc + Name: utf16_be_put Run: ../obj/ucw/unicode-t utf16_be_put In: 0041 004a 2a5f feff 0000 10ffff ffff 10000 Out: 00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00 Name: utf16_le_put Run: ../obj/ucw/unicode-t utf16_le_put In: 0041 004a 2a5f feff 0000 10ffff ffff 10000 Out: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc -Name: utf16_be_get (1) +Name: utf16_be_get Run: ../obj/ucw/unicode-t utf16_be_get In: 00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00 Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000 -Name: utf16_be_get (2) -Run: ../obj/ucw/unicode-t utf16_be_get
+Name: utf16_be_get bad surrogates In: dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01 Out: fffc 2a5f fffc fffc 2a5f fffc -Name: utf16_le_get (1) +Name: utf16_le_get Run: ../obj/ucw/unicode-t utf16_le_get In: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000 -Name: utf16_le_get (2) -Run: ../obj/ucw/unicode-t utf16_le_get +Name: utf16_le_get bad surrogates In: 1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8 Out: fffc 2a5f fffc fffc 2a5f fffc -- 2.39.2