]> mj.ucw.cz Git - misc.git/blob - utf8-check.c
Merge branch 'master' of git+ssh://git.ucw.cz/home/mj/GIT/misc
[misc.git] / utf8-check.c
1 /*
2  *      Check that the input is a proper UTF-8 file
3  *
4  *      Written in 2017 by Martin Mares <mj@ucw.cz>
5  *      and placed into public domain.
6  */
7
8 #include <stdio.h>
9 #include <stdlib.h>
10
11 typedef unsigned int uint;
12
13 static void reject(const char *msg, uint arg)
14 {
15         printf("Error: ");
16         printf(msg, arg);
17         putchar('\n');
18         exit(1);
19 }
20
21 int main(void)
22 {
23         int c;
24         while ((c = getchar()) >= 0) {
25                 if (c < 0x20) {
26                         if (c != '\r' && c != '\n' && c != '\t' && c != 0x0c)
27                                 reject("ASCII control character %02x (only CR, LF, HT, FF allowed)", c);
28                 } else if (c < 0x80) {
29                         if (c == 0x7f)
30                                 reject("ASCII DEL not allowed", c);
31                 } else if (c < 0xc0) {
32                         reject("Unexpected continuation byte %02x", c);
33                 } else if (c < 0xf8) {
34                         uint bytes = 1 + (c >= 0xe0) + (c >= 0xf0);
35                         uint x = c & (0x3f >> bytes);
36                         for (uint i=0; i<bytes; i++) {
37                                 c = getchar();
38                                 if (c < 0x80 || c >= 0xc0)
39                                         reject("Incomplete multi-byte sequence at byte %02x", c);
40                                 x = (x << 6) | (c & 0x3f);
41                         }
42
43                         static const uint min_code[] = { 0, 0x80, 0x800, 0x10000 };
44                         if (x < min_code[bytes])
45                                 reject("Non-minimalistic encoding of %06x", x);
46                         if (x > 0x10ffff)
47                                 reject("Codepoint too high: %06x", x);
48
49                         if (x >= 0x0080 && x <= 0x009f)
50                                 reject("C1 control character %04x", x);
51                         if (x >= 0xd800 && x <= 0xdfff)
52                                 reject("Surrogate code-point %04x", x);
53                         if ((x & 0xffff) >= 0xfffe)
54                                 reject("Non-character %06x", x);
55                         if (x >= 0xe000 && x <= 0xf8ff ||
56                             x >= 0xf0000 && x <= 0xffffd ||
57                             x >= 0x100000 && x <= 0x10fffd)
58                                 reject("Private-use character %06x", x);
59                 } else {
60                         reject("Invalid byte %02x", c);
61                 }
62         }
63
64         return 0;
65 }