/*
* UCW Library: Reading and writing of UTF-8 on Fastbuf Streams
*
- * (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ * (c) 2001--2015 Martin Mares <mj@ucw.cz>
* (c) 2004 Robert Spalek <robert@ucw.cz>
*
* This software may be freely distributed and used according to the terms
if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
goto wrong;
code = (code << 6) | (c & 0x3f);
+ if (code < 0x800)
+ goto wrong2;
}
else /* 2 bytes */
{
if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
goto wrong;
code = (code << 6) | (c & 0x3f);
+ if (code < 0x80)
+ goto wrong2;
}
return code;
- wrong:
+wrong:
if (c >= 0)
bungetc(b);
+wrong2:
return repl;
}
int c = bgetc(b);
int code;
int nr;
+ int limit;
if (c < 0x80) /* Includes EOF */
return c;
{
code = c & 0x1f;
nr = 1;
+ limit = 0x80;
}
else if (c < 0xf0)
{
code = c & 0x0f;
nr = 2;
+ limit = 0x800;
}
else if (c < 0xf8)
{
code = c & 0x07;
nr = 3;
+ limit = 1 << 16;
}
else if (c < 0xfc)
{
code = c & 0x03;
nr = 4;
+ limit = 1 << 21;
}
else if (c < 0xfe)
{
code = c & 0x01;
nr = 5;
+ limit = 1 << 26;
}
- else /* Too large, skip it */
- {
- while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
- ;
- goto wrong;
- }
+ else /* Too large */
+ goto wrong2;
while (nr-- > 0)
{
if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
goto wrong;
code = (code << 6) | (c & 0x3f);
}
+ if (code < limit)
+ goto wrong2;
return code;
- wrong:
+wrong:
if (c >= 0)
bungetc(b);
+wrong2:
return repl;
}
/*
* UCW Library: Reading and writing of UTF-8 and UTF-16 on Fastbuf Streams
*
- * (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ * (c) 2001--2015 Martin Mares <mj@ucw.cz>
* (c) 2004 Robert Spalek <robert@ucw.cz>
* (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
# Tests for the Unicode module
-Name: bput_utf8
+Name: bput_utf8 ASCII
Run: ../obj/ucw/ff-unicode-t bput_utf8
In: 0041 0048 004f 004a
Out: 41 48 4f 4a
-Name: bget_utf8_32
-Run: ../obj/ucw/ff-unicode-t bget_utf8_32
-In: fe 83 81
-Out: fffc
+Name: bput_utf8 BMP
+In: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
+Out: c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
+
+Name: bget_utf8 ASCII
+Run: ../obj/ucw/ff-unicode-t bget_utf8
+In: 41 48 4f 4a
+Out: 0041 0048 004f 004a
+
+Name: bget_utf8 BMP
+In: c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
+Out: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
+
+Name: bget_utf8 garbage
+In: 84 ff f9 f8 c2 aa 41
+Out: fffc fffc fffc fffc 00aa 0041
+
+Name: bget_utf8 denormalized
+In: c1 bf e0 9f bf
+Out: fffc fffc
+
+Name: bput_utf8_32
+Run: ../obj/ucw/ff-unicode-t bput_utf8_32
+In: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
+Out: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
+
+Name: bget_utf8_32
+Run: ../obj/ucw/ff-unicode-t bget_utf8_32
+In: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
+Out: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
+
+Name: bget_utf8_32 garbage
+In: fe 83 81
+Out: fffc fffc fffc
+
+Name: bget_utf8_32 denormalized
+In: c1 bf e0 9f bf f0 8f bf bf f8 87 bf bf bf fc 83 bf bf bf bf
+Out: fffc fffc fffc fffc fffc
Name: bput_utf16_be
Run: ../obj/ucw/ff-unicode-t bput_utf16_be
In: 0041 004a 2a5f feff 0000 10ffff ffff 10000
Out: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
-Name: bget_utf16_be (1)
+Name: bget_utf16_be
Run: ../obj/ucw/ff-unicode-t bget_utf16_be
In: 00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000
-Name: bget_utf16_be (2)
+Name: bget_utf16_be bad surrogates
Run: ../obj/ucw/ff-unicode-t bget_utf16_be
In: dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01
Out: fffc 2a5f fffc 2a5f fffc
-Name: bget_utf16_le (1)
+Name: bget_utf16_le
Run: ../obj/ucw/ff-unicode-t bget_utf16_le
In: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000
-Name: bget_utf16_le (2)
+Name: bget_utf16_le bad surrogates
Run: ../obj/ucw/ff-unicode-t bget_utf16_le
In: 1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8
Out: fffc 2a5f fffc 2a5f fffc
}
#define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
+#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
/**
* Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
{
u &= 0x1f;
UTF8_GET_NEXT;
+ UTF8_CHECK_RANGE(0x80);
}
else if (likely(u < 0xf0))
{
u &= 0x0f;
UTF8_GET_NEXT;
UTF8_GET_NEXT;
+ UTF8_CHECK_RANGE(0x800);
}
else
goto bad;
static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl)
{
uint u = *p++;
+ uint limit;
if (u < 0x80)
;
else if (unlikely(u < 0xc0))
- {
- /* Incorrect byte sequence */
- bad:
- u = repl;
- }
+ goto bad;
else if (u < 0xe0)
{
u &= 0x1f;
+ limit = 0x80;
goto get1;
}
else if (u < 0xf0)
{
u &= 0x0f;
+ limit = 0x800;
goto get2;
}
else if (u < 0xf8)
{
u &= 0x07;
+ limit = 1 << 16;
goto get3;
}
else if (u < 0xfc)
{
u &= 0x03;
+ limit = 1 << 21;
goto get4;
}
else if (u < 0xfe)
{
u &= 0x01;
+ limit = 1 << 26;
UTF8_GET_NEXT;
get4: UTF8_GET_NEXT;
get3: UTF8_GET_NEXT;
get2: UTF8_GET_NEXT;
get1: UTF8_GET_NEXT;
+ if (unlikely(u < limit))
+ goto bad;
}
else
goto bad;
*uu = u;
return (byte *)p;
+
+bad:
+ /* Incorrect byte sequence */
+ *uu = repl;
+ return (byte *)p;
}
/**
# Tests for the Unicode module
-Name: utf8_put (1)
+Name: utf8_put ASCII
Run: ../obj/ucw/unicode-t utf8_put
In: 0041 0048 004f 004a
Out: 41 48 4f 4a
-Name: utf8_put (2)
-Run: ../obj/ucw/unicode-t utf8_put
+Name: utf8_put BMP
In: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
Out: c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
-Name: utf8_get (1)
+Name: utf8_get ASCII
Run: ../obj/ucw/unicode-t utf8_get
In: 41 48 4f 4a
Out: 0041 0048 004f 004a
-Name: utf8_get (2)
-Run: ../obj/ucw/unicode-t utf8_get
+Name: utf8_get BMP
In: c2 aa c6 aa ca a5 d6 a5 e0 a9 9a e1 96 a5 e2 a9 9a e5 a9 9a ea 96 a5
Out: 00aa 01aa 02a5 05a5 0a5a 15a5 2a5a 5a5a a5a5
-Name: utf8_get (3)
-Run: ../obj/ucw/unicode-t utf8_get
+Name: utf8_get garbage
In: 84 ff f9 f8 c2 aa 41
Out: fffc fffc fffc fffc 00aa 0041
+Name: utf8_get denormalized
+In: c1 bf e0 9f bf
+Out: fffc fffc
+
Name: utf8_32_put
Run: ../obj/ucw/unicode-t utf8_32_put
In: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
Out: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
-Name: utf8_32_get (1)
+Name: utf8_32_get
Run: ../obj/ucw/unicode-t utf8_32_get
In: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f5 9a 96 a5 f8 8a a5 a9 9a f8 96 a5 a9 9a f8 a9 9a 96 a5 f9 96 a5 a9 9a fa a9 9a 96 a5 fc 85 a9 9a 96 a5 fc 8a 96 a5 a9 9a fc 95 a9 9a 96 a5 fc aa 96 a5 a9 9a fd 9a 96 a5 a9 9a
Out: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a
-Name: utf8_32_get (2)
-Run: ../obj/ucw/unicode-t utf8_32_get
+Name: utf8_32_get garbage
In: fe 83 81
Out: fffc fffc fffc
+Name: utf8_32_get denormalized
+In: c1 bf e0 9f bf f0 8f bf bf f8 87 bf bf bf fc 83 bf bf bf bf
+Out: fffc fffc fffc fffc fffc
+
Name: utf16_be_put
Run: ../obj/ucw/unicode-t utf16_be_put
In: 0041 004a 2a5f feff 0000 10ffff ffff 10000
Out: 00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
Name: utf16_le_put
Run: ../obj/ucw/unicode-t utf16_le_put
In: 0041 004a 2a5f feff 0000 10ffff ffff 10000
Out: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
-Name: utf16_be_get (1)
+Name: utf16_be_get
Run: ../obj/ucw/unicode-t utf16_be_get
In: 00 41 00 4a 2a 5f fe ff 00 00 db ff df ff ff ff d8 00 dc 00
Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000
-Name: utf16_be_get (2)
-Run: ../obj/ucw/unicode-t utf16_be_get
+Name: utf16_be_get bad surrogates
In: dc 1a 2a 5f d8 01 d8 01 2a 5f d8 01
Out: fffc 2a5f fffc fffc 2a5f fffc
-Name: utf16_le_get (1)
+Name: utf16_le_get
Run: ../obj/ucw/unicode-t utf16_le_get
In: 41 00 4a 00 5f 2a ff fe 00 00 ff db ff df ff ff 00 d8 00 dc
Out: 0041 004a 2a5f feff 0000 10ffff ffff 10000
-Name: utf16_le_get (2)
-Run: ../obj/ucw/unicode-t utf16_le_get
+Name: utf16_le_get bad surrogates
In: 1a dc 5f 2a 01 d8 01 d8 5f 2a 01 d8
Out: fffc 2a5f fffc fffc 2a5f fffc