From e1b410f53b4e0f2088ff407f764037717800aa98 Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Sun, 11 Jun 2017 23:28:54 +0200
Subject: [PATCH] utf8-check

---
 Makefile     |  5 ++--
 utf8-check.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf8-check.t | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+), 3 deletions(-)
 create mode 100644 utf8-check.c
 create mode 100755 utf8-check.t

diff --git a/Makefile b/Makefile
index f7d486d..723702c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,15 +2,14 @@ CC=gcc
 LD=gcc
 CFLAGS=-O2 -Wall -W -Wno-parentheses -Wstrict-prototypes -Wmissing-prototypes -Wundef -Wredundant-decls -std=gnu99
 
-all:
-	@echo "Please choose what to make:"
-	@grep '^[^ 	]*:' Makefile | grep -v =
+all: utf8-check
 
 parrot: parrot.c
 xclipcat: xclipcat.c
 xclipsend: xclipsend.c
 prefork: prefork.c
 pcap-tail: pcap-tail.c
+utf8-check: utf8-check.c
 
 fft: fft.c
 fft: LDFLAGS+=-lm
diff --git a/utf8-check.c b/utf8-check.c
new file mode 100644
index 0000000..b44d504
--- /dev/null
+++ b/utf8-check.c
@@ -0,0 +1,65 @@
+/*
+ *	Check that the input is a proper UTF-8 file
+ *
+ *	Written in 2017 by Martin Mares <mj@ucw.cz>
+ *	and placed into public domain.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef unsigned int uint;
+
+static void reject(const char *msg, uint arg)	/* report a validation failure and terminate */
+{
+	printf("Error: ");	/* the report goes to stdout, not stderr */
+	printf(msg, arg);	/* msg is used as a printf format; callers pass at most one conversion, fed by arg */
+	putchar('\n');
+	exit(1);		/* exit status 1; this function never returns */
+}
+
+/* Validate stdin as strict UTF-8 text: return 0 if clean, otherwise reject() reports the first offence and exits 1. */
+int main(void)
+{
+	int c;
+	while ((c = getchar()) >= 0) {
+		if (c < 0x20) {
+			if (c != '\r' && c != '\n' && c != '\t' && c != 0x0c)
+				reject("ASCII control character %02x (only CR, LF, HT, FF allowed)", c);
+		} else if (c < 0x80) {
+			if (c == 0x7f)
+				reject("ASCII DEL not allowed", c);
+		} else if (c < 0xc0) {
+			reject("Unexpected continuation byte %02x", c);
+		} else if (c < 0xf8) {
+			uint bytes = 1 + (c >= 0xe0) + (c >= 0xf0);	/* continuation bytes that must follow */
+			uint x = c & (0x3f >> bytes);			/* payload bits of the lead byte */
+			for (uint i=0; i<bytes; i++) {
+				c = getchar();
+				if (c < 0)	/* EOF is not a byte; printing it with %02x would show ffffffff */
+					reject("Incomplete multi-byte sequence at end of input", 0);
+				if (c < 0x80 || c >= 0xc0)
+					reject("Incomplete multi-byte sequence at byte %02x", c);
+				x = (x << 6) | (c & 0x3f);
+			}
+			static const uint min_code[] = { 0, 0x80, 0x800, 0x10000 };	/* smallest code point per value of bytes */
+			if (x < min_code[bytes])
+				reject("Non-minimalistic encoding of %06x", x);
+			if (x > 0x10ffff)
+				reject("Codepoint too high: %06x", x);
+			if (x >= 0x0080 && x <= 0x009f)
+				reject("C1 control character %04x", x);
+			if (x >= 0xd800 && x <= 0xdfff)
+				reject("Surrogate code-point %04x", x);
+			if ((x & 0xffff) >= 0xfffe || x >= 0xfdd0 && x <= 0xfdef)	/* U+nFFFE, U+nFFFF and U+FDD0..U+FDEF */
+				reject("Non-character %06x", x);
+			if (x >= 0xe000 && x <= 0xf8ff ||
+			    x >= 0xf0000 && x <= 0xffffd ||
+			    x >= 0x100000 && x <= 0x10fffd)
+				reject("Private-use character %06x", x);
+		} else {
+			reject("Invalid byte %02x", c);
+		}
+	}
+	return 0;
+}
diff --git a/utf8-check.t b/utf8-check.t
new file mode 100755
index 0000000..ee0acc1
--- /dev/null
+++ b/utf8-check.t
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+
+my @good = split /\n/, <<AMEN ;	# vectors that must be accepted; one per line: hex bytes, whitespace, '#', description
+09 0d 0a 0c				# allowed control
+33 31 34 31 35 39 32 3d 70 69		# ASCII
+73 74 c5 99 c3 ad 7a 6c c3 ad 6b 0a	# Czech
+c2 a0			# first 2-byte
+df bf			# last 2-byte
+e0 a0 80		# first 3-byte
+ef bf bd		# last valid 3-byte
+f0 90 80 80		# first 4-byte
+f3 af bf bd		# last non-private
+AMEN
+
+my @bad = split /\n/, <<AMEN ;	# vectors that must be rejected; one per line: hex bytes, whitespace, '#', description
+1f			# control
+7f			# control
+80			# continuation
+9f			# continuation
+f8			# invalid byte
+ff			# invalid byte
+c1 bf			# too small 2-byte
+c2 80			# C1 control
+e0 9f bf		# too small 3-byte
+ef bf be		# non-character
+ef bf bf		# non-character
+f0 8f bf bf		# too small 4-byte
+ee 80 80		# private plane 0
+ef a3 bf		# private plane 0
+f3 bf bf bd		# private plane F
+f4 8f bf bd		# private plane 10
+f4 8f bf be		# non-character
+f4 8f bf bf		# non-character
+ed a4 91		# high surrogate
+ed b0 91		# low surrogate
+AMEN
+
+# Pipe one test vector ("hex bytes  # description") into ./utf8-check; $outcome is true when rejection is expected.
+sub test {
+	my ($in, $outcome) = @_;
+	my ($hex, $cmt) = ($in =~ m{^(.*?)\s+#\s+(.*)$}) or die "Malformed test vector: $in\n";
+	my $raw = $hex;
+	$raw =~ s{\s+}{}g;
+	$raw =~ s{([0-9a-fA-F]{2})}{chr hex $1}ge;	# hex pairs -> raw bytes
+	print "$hex ($cmt): ";
+	open my $p, '|-', './utf8-check' or die "Cannot run ./utf8-check: $!\n";
+	print $p $raw;
+	close $p;	# flushes the pipe and waits for the child, which sets $?
+	if ($?) {
+		$outcome or die "Wrong answer\n";	# when rejection is expected, the child's "Error: ..." output follows the prefix printed above
+	} else {
+		!$outcome or die "OK, but should fail\n";
+		print "OK\n";
+	}
+}
+
+test($_, 0) for @good;	# every good vector must make ./utf8-check exit with status 0
+test($_, 1) for @bad;	# every bad vector must make it exit non-zero
-- 
2.39.5