]> mj.ucw.cz Git - misc.git/commitdiff
utf8-check
author Martin Mares <mj@ucw.cz>
Sun, 11 Jun 2017 21:28:54 +0000 (23:28 +0200)
committer Martin Mares <mj@ucw.cz>
Sun, 11 Jun 2017 21:28:54 +0000 (23:28 +0200)
Makefile
utf8-check.c [new file with mode: 0644]
utf8-check.t [new file with mode: 0755]

index f7d486df9981e65bafcb65c0abf3f878347ca030..723702cc685e0339503e2c0cf87545af391abb71 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,15 +2,14 @@ CC=gcc
 LD=gcc
 CFLAGS=-O2 -Wall -W -Wno-parentheses -Wstrict-prototypes -Wmissing-prototypes -Wundef -Wredundant-decls -std=gnu99
 
-all:
-       @echo "Please choose what to make:"
-       @grep '^[^      ]*:' Makefile | grep -v =
+all: utf8-check
 
 parrot: parrot.c
 xclipcat: xclipcat.c
 xclipsend: xclipsend.c
 prefork: prefork.c
 pcap-tail: pcap-tail.c
+utf8-check: utf8-check.c
 
 fft: fft.c
 fft: LDFLAGS+=-lm
diff --git a/utf8-check.c b/utf8-check.c
new file mode 100644 (file)
index 0000000..b44d504
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ *     Check that the input is a proper UTF-8 file
+ *
+ *     Written in 2017 by Martin Mares <mj@ucw.cz>
+ *     and placed into public domain.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef unsigned int uint;
+
+/*
+ *     Print "Error: " followed by the formatted message and a newline
+ *     to standard output, then exit with status 1.  The format string
+ *     may use at most one conversion, which consumes arg as an unsigned int.
+ */
+static void reject(const char *msg, uint arg)
+{
+       printf("Error: ");
+       printf(msg, arg);
+       putchar('\n');
+       exit(1);
+}
+
+/*
+ *     Validate standard input byte by byte.  Exits with status 1 (via
+ *     reject) at the first offending byte or code point, returns 0 when
+ *     the whole input passes.
+ */
+int main(void)
+{
+       int c;
+       while ((c = getchar()) >= 0) {
+               if (c < 0x20) {
+                       if (c != '\r' && c != '\n' && c != '\t' && c != 0x0c)
+                               reject("ASCII control character %02x (only CR, LF, HT, FF allowed)", c);
+               } else if (c < 0x80) {
+                       if (c == 0x7f)
+                               reject("ASCII DEL not allowed", c);
+               } else if (c < 0xc0) {
+                       reject("Unexpected continuation byte %02x", c);
+               } else if (c < 0xf8) {
+                       /* Lead byte of a 2-, 3- or 4-byte sequence; bytes counts the continuation bytes */
+                       uint bytes = 1 + (c >= 0xe0) + (c >= 0xf0);
+                       uint x = c & (0x3f >> bytes);
+                       for (uint i=0; i<bytes; i++) {
+                               c = getchar();
+                               /* EOF is negative; report it separately instead of printing it as a byte */
+                               if (c < 0)
+                                       reject("Incomplete multi-byte sequence at end of input", 0);
+                               if (c < 0x80 || c >= 0xc0)
+                                       reject("Incomplete multi-byte sequence at byte %02x", c);
+                               x = (x << 6) | (c & 0x3f);
+                       }
+
+                       /* Smallest code point that needs the given number of continuation bytes */
+                       static const uint min_code[] = { 0, 0x80, 0x800, 0x10000 };
+                       if (x < min_code[bytes])
+                               reject("Non-minimalistic encoding of %06x", x);
+                       if (x > 0x10ffff)
+                               reject("Codepoint too high: %06x", x);
+
+                       if (x >= 0x0080 && x <= 0x009f)
+                               reject("C1 control character %04x", x);
+                       if (x >= 0xd800 && x <= 0xdfff)
+                               reject("Surrogate code-point %04x", x);
+                       /* Non-characters: the last two code points of every plane and U+FDD0..U+FDEF */
+                       if ((x & 0xffff) >= 0xfffe ||
+                           x >= 0xfdd0 && x <= 0xfdef)
+                               reject("Non-character %06x", x);
+                       if (x >= 0xe000 && x <= 0xf8ff ||
+                           x >= 0xf0000 && x <= 0xffffd ||
+                           x >= 0x100000 && x <= 0x10fffd)
+                               reject("Private-use character %06x", x);
+               } else {
+                       reject("Invalid byte %02x", c);
+               }
+       }
+
+       /* getchar() also returns EOF on a read error; do not report that as success */
+       if (ferror(stdin))
+               reject("Error reading input", 0);
+
+       return 0;
+}
diff --git a/utf8-check.t b/utf8-check.t
new file mode 100755 (executable)
index 0000000..ee0acc1
--- /dev/null
@@ -0,0 +1,83 @@
+#!/usr/bin/perl
+#
+# Test vectors for utf8-check: every sequence is piped to ./utf8-check
+# and the exit status is compared with the expected verdict.
+
+use strict;
+use warnings;
+
+# Flush our "hex (comment): " prefix before the checker prints its message
+$| = 1;
+
+# One vector per line: hex bytes, then "#" and a description
+my @good = split /\n/, <<AMEN ;
+09 0d 0a 0c                            # allowed control
+33 31 34 31 35 39 32 3d 70 69          # ASCII
+73 74 c5 99 c3 ad 7a 6c c3 ad 6b 0a    # Czech
+c2 a0                  # first 2-byte
+df bf                  # last 2-byte
+e0 a0 80               # first 3-byte
+ef bf bd               # last valid 3-byte
+f0 90 80 80            # first 4-byte
+f3 af bf bd            # last non-private
+AMEN
+
+my @bad = split /\n/, <<AMEN ;
+1f                     # control
+7f                     # control
+80                     # continuation
+9f                     # continuation
+f8                     # invalid byte
+ff                     # invalid byte
+c1 bf                  # too small 2-byte
+c2 80                  # C1 control
+e0 9f bf               # too small 3-byte
+ef bf be               # non-character
+ef bf bf               # non-character
+f0 8f bf bf            # too small 4-byte
+ee 80 80               # private plane 0
+ef a3 bf               # private plane 0
+f3 bf bf bd            # private plane F
+f4 8f bf bd            # private plane 10
+f4 8f bf be            # non-character
+f4 8f bf bf            # non-character
+ed a4 91               # high surrogate
+ed b0 91               # low surrogate
+c3                     # truncated 2-byte
+e2 82                  # truncated 3-byte
+AMEN
+
+# Run ./utf8-check on one vector.
+# $in is a line from the tables above, $outcome is true when the
+# checker is expected to reject the input.  Dies on the first mismatch.
+sub test {
+       my ($in, $outcome) = @_;
+       my ($hex, $cmt) = ($in =~ m{^(.*?)\s+#\s+(.*)$})
+               or die "Malformed test vector: $in\n";
+       my $raw = $hex;
+       $raw =~ s{\s+}{}g;
+       $raw =~ s{([0-9a-fA-F]{2})}{chr hex $1}ge;
+       print "$hex ($cmt): ";
+
+       # If the checker dies before reading its input, report that below instead of being killed by SIGPIPE
+       local $SIG{PIPE} = 'IGNORE';
+       open my $p, '|-', './utf8-check' or die "Cannot run ./utf8-check: $!\n";
+       binmode $p;             # the vectors are raw bytes, never apply an encoding layer
+       print {$p} $raw;
+       close $p;               # flushes the pipe, waits for the checker and sets $?
+
+       my $status = $?;
+       die "Cannot collect exit status of ./utf8-check\n" if $status == -1;
+       # A crash is neither "accepted" nor "rejected"
+       die "utf8-check killed by signal " . ($status & 127) . "\n" if $status & 127;
+
+       if ($status) {
+               $outcome or die "Wrong answer\n";
+       } else {
+               !$outcome or die "OK, but should fail\n";
+               print "OK\n";
+       }
+}
+
+test($_, 0) for @good;
+test($_, 1) for @bad;