From: Pavel Charvat Date: Fri, 29 Dec 2017 11:58:52 +0000 (+0100) Subject: UTF-8: Added tests. X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=7a8495df32a6d9170f957c9eefe9f37999c62368;p=libucw.git UTF-8: Added tests. --- diff --git a/ucw/Makefile b/ucw/Makefile index dfdc9fbe..39685763 100644 --- a/ucw/Makefile +++ b/ucw/Makefile @@ -45,7 +45,7 @@ LIBUCW_MAIN_INCLUDES= \ lib.h log.h tbf.h threads.h time.h \ alloc.h mempool.h eltpool.h \ clists.h slists.h simple-lists.h \ - string.h stkstring.h unicode.h varint.h chartype.h regex.h \ + string.h stkstring.h unicode.h unicode-gen.h varint.h chartype.h regex.h \ wildmatch.h \ unaligned.h \ bbuf.h gbuf.h gary.h bitarray.h bitsig.h \ diff --git a/ucw/ff-unicode.c b/ucw/ff-unicode.c index de6df9d7..202a751f 100644 --- a/ucw/ff-unicode.c +++ b/ucw/ff-unicode.c @@ -3,6 +3,7 @@ * * (c) 2001--2015 Martin Mares * (c) 2004 Robert Spalek + * (c) 2017 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -158,8 +159,8 @@ bput_utf16_le_slow(struct fastbuf *b, uint u) int main(int argc, char **argv) { #define FUNCS \ - F(BGET_UTF8) F(BGET_UTF8_32) F(BGET_UTF16_BE) F(BGET_UTF16_LE) \ - F(BPUT_UTF8) F(BPUT_UTF8_32) F(BPUT_UTF16_BE) F(BPUT_UTF16_LE) + F(BGET_UTF8) F(BGET_UTF8_FULL) F(BGET_UTF8_32) F(BGET_UTF16_BE) F(BGET_UTF16_LE) \ + F(BPUT_UTF8) F(BPUT_UTF8_FULL) F(BPUT_UTF8_32) F(BPUT_UTF16_BE) F(BPUT_UTF16_LE) enum { #define F(x) FUNC_##x, @@ -199,6 +200,9 @@ int main(int argc, char **argv) case FUNC_BGET_UTF8: u = bget_utf8_slow(b, UNI_REPLACEMENT); break; + case FUNC_BGET_UTF8_FULL: + u = bget_utf8_full_slow(b, UNI_REPLACEMENT); + break; case FUNC_BGET_UTF8_32: u = bget_utf8_32_slow(b, UNI_REPLACEMENT); break; @@ -225,6 +229,9 @@ int main(int argc, char **argv) case FUNC_BPUT_UTF8: bput_utf8_slow(b, u); break; + case FUNC_BPUT_UTF8_FULL: + bput_utf8_full_slow(b, u); + break; case FUNC_BPUT_UTF8_32: bput_utf8_32_slow(b, u); break; diff --git a/ucw/ff-unicode.h b/ucw/ff-unicode.h index e42b3102..5a32919b 100644 --- a/ucw/ff-unicode.h +++ b/ucw/ff-unicode.h @@ -3,7 +3,7 @@ * * (c) 2001--2015 Martin Mares * (c) 2004 Robert Spalek - * (c) 2007--2008 Pavel Charvat + * (c) 2007--2017 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. diff --git a/ucw/ff-unicode.t b/ucw/ff-unicode.t index 82b8518c..9814a92e 100644 --- a/ucw/ff-unicode.t +++ b/ucw/ff-unicode.t @@ -26,6 +26,24 @@ Name: bget_utf8 denormalized In: c1 bf e0 9f bf Out: fffc fffc +Name: bput_utf8_full +Run: ../obj/ucw/ff-unicode-t bput_utf8_full +In: 15a5a 2a5a5 5a5a5 a5a5a 10ffff +Out: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f4 8f bf bf + +Name: bget_utf8_full +Run: ../obj/ucw/ff-unicode-t bget_utf8_full +In: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f4 8f bf bf +Out: 15a5a 2a5a5 5a5a5 a5a5a 10ffff + +Name: bget_utf8_full garbage +In: fe 83 81 f4 90 80 80 +Out: fffc fffc fffc fffc + +Name: bget_utf8_full denormalized +In: c1 bf e0 9f bf f0 8f bf bf +Out: fffc fffc fffc + Name: bput_utf8_32 Run: ../obj/ucw/ff-unicode-t bput_utf8_32 In: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a diff --git a/ucw/unicode-gen.h b/ucw/unicode-gen.h index 6e10b8a9..3f462407 100644 --- a/ucw/unicode-gen.h +++ b/ucw/unicode-gen.h @@ -1,3 +1,12 @@ +/* + * UCW Library -- Generator of UTF-8 functions (for internal usage only, don't include it directly) + * + * (c) 2017 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + #if defined(UNI_WANT_UTF8_32) #define UNI_MAX_UTF8_BYTES 6 #elif defined(UNI_WANT_UTF8_FULL) @@ -60,6 +69,8 @@ put1: ASSERT(0); } +#undef UNI_PUT_NEXT + #endif /* Reading UTF-8 */ @@ -151,8 +162,12 @@ bad: UNI_GIVE_OK; } +#undef UNI_GET_NEXT + #endif +/* Clean macros before next usage */ + #undef UNI_WANT_PUT_UTF8 #undef UNI_WANT_GET_UTF8 @@ -167,6 +182,3 @@ bad: #undef UNI_GIVE_SKIPC #undef UNI_GIVE_OK #undef UNI_GIVE_BAD - -#undef UNI_PUT_NEXT -#undef UNI_GET_NEXT diff --git a/ucw/unicode.c b/ucw/unicode.c index 8615f122..7b558715 100644 --- a/ucw/unicode.c +++ b/ucw/unicode.c @@ -3,6 +3,7 @@ * * (c) 1997--2004 Martin Mares * (c) 2003 Robert Spalek + * (c) 2017 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -46,8 +47,8 @@ int main(int argc, char **argv) byte buf[256]; #define FUNCS \ - F(UTF8_GET) F(UTF8_32_GET) F(UTF16_BE_GET) F(UTF16_LE_GET) \ - F(UTF8_PUT) F(UTF8_32_PUT) F(UTF16_BE_PUT) F(UTF16_LE_PUT) + F(UTF8_GET) F(UTF8_FULL_GET) F(UTF8_32_GET) F(UTF16_BE_GET) F(UTF16_LE_GET) \ + F(UTF8_PUT) F(UTF8_FULL_PUT) F(UTF8_32_PUT) F(UTF16_BE_PUT) F(UTF16_LE_PUT) enum { #define F(x) FUNC_##x, @@ -88,6 +89,9 @@ int main(int argc, char **argv) case FUNC_UTF8_GET: p = utf8_get(p, &u); break; + case FUNC_UTF8_FULL_GET: + p = utf8_full_get(p, &u); + break; case FUNC_UTF8_32_GET: p = utf8_32_get(p, &u); break; @@ -116,6 +120,9 @@ int main(int argc, char **argv) case FUNC_UTF8_PUT: p = utf8_put(p, u); break; + case FUNC_UTF8_FULL_PUT: + p = utf8_full_put(p, u); + break; case FUNC_UTF8_32_PUT: p = utf8_32_put(p, u); break; diff --git a/ucw/unicode.h b/ucw/unicode.h index b35531cd..e6ad42ab 100644 --- a/ucw/unicode.h +++ b/ucw/unicode.h @@ -3,7 +3,7 @@ * * (c) 1997--2004 Martin Mares * (c) 2004 Robert Spalek - * (c) 2007 Pavel Charvat + * (c) 2007--2017 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. diff --git a/ucw/unicode.t b/ucw/unicode.t index 9a2e2adf..9c07fc44 100644 --- a/ucw/unicode.t +++ b/ucw/unicode.t @@ -26,6 +26,24 @@ Name: utf8_get denormalized In: c1 bf e0 9f bf Out: fffc fffc +Name: utf8_full_put +Run: ../obj/ucw/unicode-t utf8_full_put +In: 15a5a 2a5a5 5a5a5 a5a5a 10ffff +Out: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f4 8f bf bf + +Name: utf8_full_get +Run: ../obj/ucw/unicode-t utf8_full_get +In: f0 95 a9 9a f0 aa 96 a5 f1 9a 96 a5 f2 a5 a9 9a f4 8f bf bf +Out: 15a5a 2a5a5 5a5a5 a5a5a 10ffff + +Name: utf8_full_get garbage +In: fe 83 81 f4 90 80 80 +Out: fffc fffc fffc fffc + +Name: utf8_full_get denormalized +In: c1 bf e0 9f bf f0 8f bf bf +Out: fffc fffc fffc + Name: utf8_32_put Run: ../obj/ucw/unicode-t utf8_32_put In: 15a5a 2a5a5 5a5a5 a5a5a 15a5a5 2a5a5a 5a5a5a a5a5a5 15a5a5a 2a5a5a5 5a5a5a5 a5a5a5a 15a5a5a5 2a5a5a5a 5a5a5a5a