--- /dev/null
+/*
+ * Check that the input is a proper UTF-8 file
+ *
+ * Written in 2017 by Martin Mares <mj@ucw.cz>
+ * and placed into public domain.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef unsigned int uint;
+
+/*
+ * Report a validation failure and terminate the program.
+ *
+ * msg is used as a printf() format string and arg is the single value
+ * handed to it; a message without a conversion simply ignores arg.
+ * The text "Error: ", the formatted message and a newline are written
+ * to standard output, then the process exits with status 1.
+ * This function never returns.
+ */
+static void reject(const char *msg, uint arg)
+{
+    fputs("Error: ", stdout);
+    printf(msg, arg);
+    fputc('\n', stdout);
+    exit(1);
+}
+
+/*
+ * Read standard input byte by byte and check that it is well-formed UTF-8
+ * limited to the characters this tool accepts.
+ *
+ * Rejected (via reject(), which exits with status 1):
+ *   - ASCII control characters other than CR, LF, HT and FF, and DEL
+ *   - stray continuation bytes and bytes 0xf8..0xff
+ *   - multi-byte sequences that are cut short (by another byte or by EOF)
+ *   - over-long encodings and code points above 0x10ffff
+ *   - C1 controls, surrogates, non-characters and private-use characters
+ *   - a read error on standard input
+ *
+ * Returns 0 when the whole input passes.
+ */
+int main(void)
+{
+    /* Smallest code point that needs 1, 2 or 3 continuation bytes (index 0 unused). */
+    static const uint min_code[] = { 0, 0x80, 0x800, 0x10000 };
+    int c;
+
+    while ((c = getchar()) != EOF) {
+        if (c < 0x20) {
+            if (c != '\r' && c != '\n' && c != '\t' && c != 0x0c)
+                reject("ASCII control character %02x (only CR, LF, HT, FF allowed)", c);
+        } else if (c < 0x80) {
+            if (c == 0x7f)
+                reject("ASCII DEL not allowed", c);
+        } else if (c < 0xc0) {
+            reject("Unexpected continuation byte %02x", c);
+        } else if (c < 0xf8) {
+            /* Lead byte: 0xc0.. has 1 continuation byte, 0xe0.. has 2, 0xf0.. has 3. */
+            uint bytes = 1 + (c >= 0xe0) + (c >= 0xf0);
+            /* Keep only the payload bits of the lead byte (5, 4 or 3 of them). */
+            uint x = c & (0x3f >> bytes);
+            for (uint i=0; i<bytes; i++) {
+                c = getchar();
+                /* EOF is negative; report it separately instead of printing it as a byte value. */
+                if (c == EOF)
+                    reject("Incomplete multi-byte sequence at end of input", 0);
+                if (c < 0x80 || c >= 0xc0)
+                    reject("Incomplete multi-byte sequence at byte %02x", c);
+                x = (x << 6) | (c & 0x3f);
+            }
+
+            if (x < min_code[bytes])
+                reject("Non-minimalistic encoding of %06x", x);
+            if (x > 0x10ffff)
+                reject("Codepoint too high: %06x", x);
+
+            if (x >= 0x0080 && x <= 0x009f)
+                reject("C1 control character %04x", x);
+            if (x >= 0xd800 && x <= 0xdfff)
+                reject("Surrogate code-point %04x", x);
+            /* Non-characters: the last two code points of every plane and U+FDD0..U+FDEF. */
+            if ((x & 0xffff) >= 0xfffe || (x >= 0xfdd0 && x <= 0xfdef))
+                reject("Non-character %06x", x);
+            if ((x >= 0xe000 && x <= 0xf8ff) ||
+                (x >= 0xf0000 && x <= 0xffffd) ||
+                (x >= 0x100000 && x <= 0x10fffd))
+                reject("Private-use character %06x", x);
+        } else {
+            reject("Invalid byte %02x", c);
+        }
+    }
+
+    /* getchar() also returns EOF on failure; do not treat a read error as a valid end of file. */
+    if (ferror(stdin))
+        reject("Read error on standard input", 0);
+
+    return 0;
+}
--- /dev/null
+#!/usr/bin/perl
+# Test driver for ./utf8-check: feeds it byte sequences and checks the verdict.
+
+use strict;
+use warnings;
+
+# Each vector is one line: hex byte pairs, whitespace, '#', a description.
+# The description is part of the data (it is parsed and printed by test()).
+
+# Sequences the checker must accept.
+my @good = split /\n/, <<AMEN ;
+09 0d 0a 0c # allowed control
+33 31 34 31 35 39 32 3d 70 69 # ASCII
+73 74 c5 99 c3 ad 7a 6c c3 ad 6b 0a # Czech
+c2 a0 # first 2-byte
+df bf # last 2-byte
+e0 a0 80 # first 3-byte
+ef bf bd # last valid 3-byte
+f0 90 80 80 # first 4-byte
+f3 af bf bd # last non-private
+AMEN
+
+# Sequences the checker must reject.
+my @bad = split /\n/, <<AMEN ;
+1f # control
+7f # control
+80 # continuation
+9f # continuation
+f8 # invalid byte
+ff # invalid byte
+c1 bf # too small 2-byte
+c2 80 # C1 control
+e0 9f bf # too small 3-byte
+ef bf be # non-character
+ef bf bf # non-character
+f0 8f bf bf # too small 4-byte
+ee 80 80 # private plane 0
+ef a3 bf # private plane 0
+f3 bf bf bd # private plane F
+f4 8f bf bd # private plane 10
+f4 8f bf be # non-character
+f4 8f bf bf # non-character
+ed a4 91 # high surrogate
+ed b0 91 # low surrogate
+f4 90 80 80 # above U+10FFFF
+f7 bf bf bf # above U+10FFFF
+c3 # truncated 2-byte
+e2 82 # truncated 3-byte
+f0 9f 98 # truncated 4-byte
+c3 41 # lead byte followed by ASCII
+AMEN
+
+# Run one test vector through ./utf8-check and verify its verdict.
+#
+#   $in      - vector line: hex byte pairs, whitespace, '#', a description
+#   $outcome - false if the checker must accept the bytes, true if it must
+#              reject them
+#
+# Prints "<hex> (<description>): " before running the checker and "OK" after
+# an accepted vector; for a rejected vector nothing more is printed here.
+# Dies if the vector line is malformed, if the checker cannot be started,
+# if it ends abnormally (signal or exit status above 1), or if its verdict
+# does not match $outcome.
+sub test {
+    my ($in, $outcome) = @_;
+    my ($hex, $cmt) = ($in =~ m{^(.*?)\s+#\s+(.*)$})
+        or die "Malformed test vector: $in\n";
+
+    # Turn the hex dump into the raw bytes to feed to the checker.
+    my $raw = $hex;
+    $raw =~ s{\s+}{}g;
+    $raw =~ s{([0-9a-fA-F]{2})}{chr hex $1}ge;
+
+    print "$hex ($cmt): ";
+    open my $p, '|-', './utf8-check'
+        or die "Cannot run ./utf8-check: $!\n";
+    binmode $p;                 # the payload is bytes, not text
+    print {$p} $raw;
+
+    # close() flushes the pipe and waits for the checker.  It returns false
+    # whenever the checker exits non-zero, which is an expected result here,
+    # so the verdict is taken from $? rather than from close's return value.
+    close $p;
+    my $status = $?;
+
+    # Only exit status 0 (accepted) or 1 (rejected) is a verdict; a crash
+    # must not be mistaken for a correct rejection.
+    die "Cannot collect utf8-check status: $!\n" if $status == -1;
+    die "utf8-check killed by signal " . ($status & 127) . "\n"
+        if $status & 127;
+    my $exit = $status >> 8;
+    die "utf8-check exited with unexpected status $exit\n" if $exit > 1;
+
+    if ($exit) {
+        $outcome or die "Wrong answer\n";
+    } else {
+        !$outcome or die "OK, but should fail\n";
+        print "OK\n";
+    }
+}
+
+# Run every vector: entries of @good are passed with outcome 0 (must be
+# accepted), entries of @bad with outcome 1 (must be rejected).
+foreach my $vector (@good) {
+    test($vector, 0);
+}
+foreach my $vector (@bad) {
+    test($vector, 1);
+}