From e1b410f53b4e0f2088ff407f764037717800aa98 Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Sun, 11 Jun 2017 23:28:54 +0200 Subject: [PATCH] utf8-check --- Makefile | 5 ++-- utf8-check.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ utf8-check.t | 58 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 3 deletions(-) create mode 100644 utf8-check.c create mode 100755 utf8-check.t diff --git a/Makefile b/Makefile index f7d486d..723702c 100644 --- a/Makefile +++ b/Makefile @@ -2,15 +2,14 @@ CC=gcc LD=gcc CFLAGS=-O2 -Wall -W -Wno-parentheses -Wstrict-prototypes -Wmissing-prototypes -Wundef -Wredundant-decls -std=gnu99 -all: - @echo "Please choose what to make:" - @grep '^[^ ]*:' Makefile | grep -v = +all: utf8-check parrot: parrot.c xclipcat: xclipcat.c xclipsend: xclipsend.c prefork: prefork.c pcap-tail: pcap-tail.c +utf8-check: utf8-check.c fft: fft.c fft: LDFLAGS+=-lm diff --git a/utf8-check.c b/utf8-check.c new file mode 100644 index 0000000..b44d504 --- /dev/null +++ b/utf8-check.c @@ -0,0 +1,65 @@ +/* + * Check that the input is a proper UTF-8 file + * + * Written in 2017 by Martin Mares + * and placed into public domain. 
/*
 * utf8-check: verify that standard input is well-formed, "clean" UTF-8 text.
 *
 * Bytes are read from stdin until end of input.  On the first violation
 * the program prints "Error: <reason>" to stdout and exits with status 1.
 * If no violation is found it exits with status 0.
 *
 * Besides structural UTF-8 errors (stray continuation bytes, truncated
 * sequences, over-long encodings, values above U+10FFFF), the following
 * code points are rejected as well: ASCII control characters other than
 * CR, LF, HT and FF; DEL; C1 controls; surrogates; noncharacters; and
 * private-use characters.
 */

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int uint;

/*
 * Report a violation and terminate the program.
 *
 * msg: printf-style format; it may contain at most one conversion, which
 *      must accept an unsigned int (all callers use %x variants).
 * arg: value for that conversion; ignored when msg has no conversion.
 *
 * Writes "Error: ", the formatted message and a newline to stdout, then
 * exits with status 1.  Never returns.
 */
static void reject(const char *msg, uint arg)
{
	printf("Error: ");
	printf(msg, arg);
	putchar('\n');
	exit(1);
}

/*
 * Validate stdin byte by byte.
 *
 * Returns 0 when the whole input passed every check.  Any violation ends
 * the process through reject() with status 1.
 */
int main(void)
{
	int c;

	while ((c = getchar()) != EOF) {
		if (c < 0x20) {
			if (c != '\r' && c != '\n' && c != '\t' && c != 0x0c)
				reject("ASCII control character %02x (only CR, LF, HT, FF allowed)", c);
		} else if (c < 0x80) {
			if (c == 0x7f)
				reject("ASCII DEL not allowed", c);
		} else if (c < 0xc0) {
			/* 10xxxxxx may only appear after a lead byte. */
			reject("Unexpected continuation byte %02x", c);
		} else if (c < 0xf8) {
			/*
			 * Lead byte 110xxxxx, 1110xxxx or 11110xxx announces
			 * 1, 2 or 3 continuation bytes respectively.
			 */
			uint bytes = 1 + (c >= 0xe0) + (c >= 0xf0);

			/* Payload bits of the lead byte: 5, 4 or 3 of them. */
			uint x = c & (0x3f >> bytes);

			for (uint i = 0; i < bytes; i++) {
				c = getchar();
				/*
				 * Report truncated input explicitly instead of
				 * formatting EOF as if it were a byte value.
				 */
				if (c == EOF)
					reject("Incomplete multi-byte sequence at end of input", 0);
				if (c < 0x80 || c >= 0xc0)
					reject("Incomplete multi-byte sequence at byte %02x", c);
				x = (x << 6) | (c & 0x3f);
			}

			/* Smallest code point that needs this many continuation bytes. */
			static const uint min_code[] = { 0, 0x80, 0x800, 0x10000 };
			if (x < min_code[bytes])
				reject("Non-minimalistic encoding of %06x", x);
			if (x > 0x10ffff)
				reject("Codepoint too high: %06x", x);

			if (x >= 0x0080 && x <= 0x009f)
				reject("C1 control character %04x", x);
			if (x >= 0xd800 && x <= 0xdfff)
				reject("Surrogate code-point %04x", x);
			/*
			 * Noncharacters: the last two code points of every
			 * plane, plus the block U+FDD0..U+FDEF.
			 */
			if ((x & 0xffff) >= 0xfffe || (x >= 0xfdd0 && x <= 0xfdef))
				reject("Non-character %06x", x);
			if ((x >= 0xe000 && x <= 0xf8ff) ||
			    (x >= 0xf0000 && x <= 0xffffd) ||
			    (x >= 0x100000 && x <= 0x10fffd))
				reject("Private-use character %06x", x);
		} else {
			/* 0xf8..0xff never occur in UTF-8. */
			reject("Invalid byte %02x", c);
		}
	}

	/* getchar() returns EOF on failure too; do not report success then. */
	if (ferror(stdin))
		reject("Read error on standard input", 0);

	return 0;
}

/*
 * The patch this program was extracted from continues with a Perl test
 * script, utf8-check.t, of which only the beginning is visible here:
 *
 *   diff --git a/utf8-check.t b/utf8-check.t
 *   new file mode 100755
 *   index 0000000..ee0acc1
 *   --- /dev/null
 *   +++ b/utf8-check.t
 *   @@ -0,0 +1,58 @@
 *   #!/usr/bin/perl
 *   my @good = split /\n/, <
 */