2 * Check that the input is a proper UTF-8 file
4 * Written in 2017 by Martin Mares <mj@ucw.cz>
5 * and placed into public domain.
/* Short alias for unsigned int; used below for byte counts, loop indices and decoded code points. */
11 typedef unsigned int uint;
13 static void reject(const char *msg, uint arg)
/*
 * Main validation loop: read bytes one at a time with getchar() until it
 * returns a negative value, classify each byte by its value and call
 * reject() with a printf-style message and the offending value whenever a
 * rule is violated.
 * NOTE(review): several lines of this loop (some guards, the byte fetch in
 * the continuation loop, closing braces) are absent from this listing; the
 * comments below mark every place where that matters.
 */
24 while ((c = getchar()) >= 0) {
/*
 * Control characters: only CR, LF, HT and FF (0x0c) are tolerated.
 * NOTE(review): the guard opening this branch is not shown here --
 * presumably c < 0x20; confirm.
 */
26 if (c != '\r' && c != '\n' && c != '\t' && c != 0x0c)
27 reject("ASCII control character %02x (only CR, LF, HT, FF allowed)", c);
/* Remaining single-byte (7-bit) values. */
28 } else if (c < 0x80) {
/* NOTE(review): the condition guarding this call is not shown -- presumably c == 0x7f; confirm. */
30 reject("ASCII DEL not allowed", c);
/* 0x80..0xbf: a continuation byte in a position where a lead byte is expected. */
31 } else if (c < 0xc0) {
32 reject("Unexpected continuation byte %02x", c);
/* 0xc0..0xf7: lead byte of a multi-byte sequence. */
33 } else if (c < 0xf8) {
/* Number of continuation bytes to follow: 1 for 0xc0..0xdf, 2 for 0xe0..0xef, 3 for 0xf0..0xf7. */
34 uint bytes = 1 + (c >= 0xe0) + (c >= 0xf0);
/* Keep only the payload bits of the lead byte: the mask is 0x1f, 0x0f or 0x07 for 1, 2 or 3 continuation bytes. */
35 uint x = c & (0x3f >> bytes);
/*
 * Consume the continuation bytes.
 * NOTE(review): the statement that fetches the next byte into c is not
 * shown here -- presumably c = getchar(); confirm.
 */
36 for (uint i=0; i<bytes; i++) {
/* Every continuation byte must lie in 0x80..0xbf. */
38 if (c < 0x80 || c >= 0xc0)
39 reject("Incomplete multi-byte sequence at byte %02x", c);
/* Append the low six payload bits to the code point being assembled. */
40 x = (x << 6) | (c & 0x3f);
/*
 * min_code[n] is the smallest value that is accepted for a sequence with n
 * continuation bytes; anything lower is reported as a non-minimal (overlong)
 * encoding.
 */
43 static const uint min_code[] = { 0, 0x80, 0x800, 0x10000 };
44 if (x < min_code[bytes])
45 reject("Non-minimalistic encoding of %06x", x);
/* NOTE(review): the guard for this call is not shown -- presumably x > 0x10ffff; confirm. */
47 reject("Codepoint too high: %06x", x);
/* C1 control range 0x80..0x9f. */
49 if (x >= 0x0080 && x <= 0x009f)
50 reject("C1 control character %04x", x);
/* Surrogate range 0xd800..0xdfff. */
51 if (x >= 0xd800 && x <= 0xdfff)
52 reject("Surrogate code-point %04x", x);
/*
 * Values whose low 16 bits are 0xfffe or 0xffff, i.e. the last two code
 * points of every plane.
 * NOTE(review): no check for 0xfdd0..0xfdef appears in the lines shown --
 * confirm whether that is intentional.
 */
53 if ((x & 0xffff) >= 0xfffe)
54 reject("Non-character %06x", x);
/*
 * Private-use ranges: 0xe000..0xf8ff, 0xf0000..0xffffd and
 * 0x100000..0x10fffd. && binds tighter than ||, so each line forms one
 * range test.
 */
55 if (x >= 0xe000 && x <= 0xf8ff ||
56 x >= 0xf0000 && x <= 0xffffd ||
57 x >= 0x100000 && x <= 0x10fffd)
58 reject("Private-use character %06x", x);
/*
 * NOTE(review): the branch header for this call is not shown -- presumably
 * the final else, reached for bytes 0xf8..0xff; confirm.
 */
60 reject("Invalid byte %02x", c);