mj.ucw.cz Git - libucw.git/blob - lib/unicode.h
renamed attr_set_type() to put_attr_set_type()
[libucw.git] / lib / unicode.h
1 /*
2  *      Sherlock Library -- Unicode Characters
3  *
4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
5  *      (c) 2004 Robert Spalek <robert@ucw.cz>
6  *
7  *      This software may be freely distributed and used according to the terms
8  *      of the GNU Lesser General Public License.
9  */
10
11 #ifndef _UNICODE_H
12 #define _UNICODE_H
13
14 /* Macros for handling UTF-8 */
15
16 #define UNI_REPLACEMENT 0xfffc
17
/*
 * PUT_UTF8(p,u) -- store the value u at p as a 1-, 2- or 3-byte UTF-8
 * sequence and advance p past the bytes written.
 *
 * There is no upper range check: every u >= 0x800 takes the three-byte
 * form, so the result is well-formed only for u < 0x10000 (larger values
 * are handled by PUT_UTF8_32).
 *
 * p must be a modifiable pointer lvalue.  Both arguments are expanded
 * several times, so they must not have side effects.  They are fully
 * parenthesized, so expressions such as `a | b` are safe to pass as u.
 */
#define PUT_UTF8(p,u) do {                      \
  if ((u) < 0x80)                               \
    *(p)++ = (u);                               \
  else if ((u) < 0x800)                         \
    {                                           \
      *(p)++ = 0xc0 | ((u) >> 6);               \
      *(p)++ = 0x80 | ((u) & 0x3f);             \
    }                                           \
  else                                          \
    {                                           \
      *(p)++ = 0xe0 | ((u) >> 12);              \
      *(p)++ = 0x80 | (((u) >> 6) & 0x3f);      \
      *(p)++ = 0x80 | ((u) & 0x3f);             \
    }                                           \
  } while(0)
33
/*
 * PUT_UTF8_32(p,u) -- store the value u at p in UTF-8 and advance p,
 * using the extended 4-, 5- and 6-byte forms where needed.
 *
 * Values below 1<<16 are delegated to PUT_UTF8.  Values of 1U<<31 and
 * above match no branch: nothing is written and p is left unchanged.
 *
 * p must be a modifiable pointer lvalue.  Both arguments are expanded
 * several times, so they must not have side effects.  They are fully
 * parenthesized, so expressions such as `a | b` are safe to pass as u.
 */
#define PUT_UTF8_32(p,u) do {                   \
  if ((u) < (1<<16))                            \
    PUT_UTF8(p,(u));                            \
  else if ((u) < (1<<21))                       \
    {                                           \
      *(p)++ = 0xf0 | ((u) >> 18);              \
      *(p)++ = 0x80 | (((u) >> 12) & 0x3f);     \
      *(p)++ = 0x80 | (((u) >> 6) & 0x3f);      \
      *(p)++ = 0x80 | ((u) & 0x3f);             \
    }                                           \
  else if ((u) < (1<<26))                       \
    {                                           \
      *(p)++ = 0xf8 | ((u) >> 24);              \
      *(p)++ = 0x80 | (((u) >> 18) & 0x3f);     \
      *(p)++ = 0x80 | (((u) >> 12) & 0x3f);     \
      *(p)++ = 0x80 | (((u) >> 6) & 0x3f);      \
      *(p)++ = 0x80 | ((u) & 0x3f);             \
    }                                           \
  else if ((u) < (1U<<31))                      \
    {                                           \
      *(p)++ = 0xfc | ((u) >> 30);              \
      *(p)++ = 0x80 | (((u) >> 24) & 0x3f);     \
      *(p)++ = 0x80 | (((u) >> 18) & 0x3f);     \
      *(p)++ = 0x80 | (((u) >> 12) & 0x3f);     \
      *(p)++ = 0x80 | (((u) >> 6) & 0x3f);      \
      *(p)++ = 0x80 | ((u) & 0x3f);             \
    }                                           \
  } while(0)
62
/*
 * IS_UTF8(c) -- true when the byte value c is 0xc0 or above, i.e. when it
 * is the first byte of a multi-byte sequence rather than a single-byte
 * character or a continuation byte.  c is expected to be an unsigned
 * byte value.
 */
#define IS_UTF8(c) (0xc0 <= (c))
64
/*
 * GET_UTF8_CHAR(p,u) -- decode one multi-byte UTF-8 sequence starting at
 * *p into u and advance p past the bytes consumed.
 *
 * *p is expected to be a lead byte (>= 0xc0); single-byte characters are
 * not handled here.  Only 2- and 3-byte sequences are decoded.  For a
 * lead byte >= 0xf0 the lead and all following continuation bytes are
 * skipped and u is set to UNI_REPLACEMENT.
 *
 * A continuation byte is consumed only if it has the form 10xxxxxx, so a
 * truncated sequence yields the bits collected so far and leaves p at the
 * first byte that is not a continuation.
 *
 * p must be a modifiable pointer lvalue to unsigned bytes, u a modifiable
 * integer lvalue; both are expanded several times.
 */
#define GET_UTF8_CHAR(p,u) do {                         \
    if (*(p) >= 0xf0)                                   \
      { /* Too large, use replacement char */           \
        (p)++;                                          \
        while ((*(p) & 0xc0) == 0x80)                   \
          (p)++;                                        \
        (u) = UNI_REPLACEMENT;                          \
      }                                                 \
    else if (*(p) >= 0xe0)                              \
      {                                                 \
        (u) = *(p)++ & 0x0f;                            \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
      }                                                 \
    else                                                \
      {                                                 \
        (u) = *(p)++ & 0x1f;                            \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
      }                                                 \
  } while (0)
88
/*
 * GET_UTF8_32_CHAR(p,u) -- decode one multi-byte UTF-8 sequence starting
 * at *p into u and advance p, accepting the extended 4-, 5- and 6-byte
 * forms as well.
 *
 * Lead bytes below 0xf0 are delegated to GET_UTF8_CHAR.  Lead bytes 0xfe
 * and 0xff start no valid sequence: the lead and all following
 * continuation bytes are skipped and u is set to UNI_REPLACEMENT.
 *
 * A continuation byte is consumed only if it has the form 10xxxxxx, so a
 * truncated sequence yields the bits collected so far and leaves p at the
 * first byte that is not a continuation.
 *
 * p must be a modifiable pointer lvalue to unsigned bytes, u a modifiable
 * integer lvalue; both are expanded several times.
 */
#define GET_UTF8_32_CHAR(p,u) do {                      \
    if (*(p) < 0xf0)                                    \
      GET_UTF8_CHAR(p,u);                               \
    else if (*(p) < 0xf8)                               \
      {                                                 \
        (u) = *(p)++ & 0x07;                            \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
      }                                                 \
    else if (*(p) < 0xfc)                               \
      {                                                 \
        (u) = *(p)++ & 0x03;                            \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
      }                                                 \
    else if (*(p) < 0xfe)                               \
      {                                                 \
        (u) = *(p)++ & 0x01;                            \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
        if ((*(p) & 0xc0) == 0x80)                      \
          (u) = ((u) << 6) | (*(p)++ & 0x3f);           \
      }                                                 \
    else                                                \
      { /* Too large, use replacement char */           \
        (p)++;                                          \
        while ((*(p) & 0xc0) == 0x80)                   \
          (p)++;                                        \
        (u) = UNI_REPLACEMENT;                          \
      }                                                 \
  } while (0)
136
/*
 * GET_UTF8(p,u) -- read one character from the UTF-8 string at p into u
 * and advance p.  Bytes below 0xc0 are returned as they are; anything
 * else is decoded by GET_UTF8_CHAR.
 *
 * The expansion is a single do { } while (0) statement, like the other
 * macros in this file, so it nests safely inside if/else.  Use it as a
 * statement terminated by a semicolon.  p must be a modifiable pointer
 * lvalue to unsigned bytes and u a modifiable integer lvalue; both are
 * expanded several times.
 */
#define GET_UTF8(p,u) do {              \
    if (IS_UTF8(*(p)))                  \
      GET_UTF8_CHAR(p,u);               \
    else                                \
      (u) = *(p)++;                     \
  } while (0)
142
/*
 * GET_UTF8_32(p,u) -- read one character from the UTF-8 string at p into
 * u and advance p.  Bytes below 0xc0 are returned as they are; anything
 * else is decoded by GET_UTF8_32_CHAR, which also accepts the extended
 * 4-, 5- and 6-byte forms.
 *
 * The expansion is a single do { } while (0) statement, like the other
 * macros in this file, so it nests safely inside if/else.  Use it as a
 * statement terminated by a semicolon.  p must be a modifiable pointer
 * lvalue to unsigned bytes and u a modifiable integer lvalue; both are
 * expanded several times.
 */
#define GET_UTF8_32(p,u) do {           \
    if (IS_UTF8(*(p)))                  \
      GET_UTF8_32_CHAR(p,u);            \
    else                                \
      (u) = *(p)++;                     \
  } while (0)
148
/*
 * UTF8_SKIP(p) -- advance p over one character of a UTF-8 string without
 * decoding it.
 *
 * The first byte is always consumed.  If it is a lead byte (>= 0xc0),
 * one continuation byte (0x80..0xbf) is skipped for every further 1 bit
 * that follows the top bit of the lead byte.  The loop stops early at
 * the first byte that is not a continuation, so truncated sequences are
 * not overrun.
 *
 * p must be a modifiable pointer lvalue to unsigned bytes.  The internal
 * temporary has a macro-specific name, so an argument that happens to be
 * called `c` is not captured by it.
 */
#define UTF8_SKIP(p) do {                                       \
    uns utf8_skip_lead_ = *(p)++;                               \
    if (utf8_skip_lead_ >= 0xc0)                                \
      while ((utf8_skip_lead_ & 0x40)                           \
             && *(p) >= 0x80 && *(p) < 0xc0)                    \
        {                                                       \
          (p)++;                                                \
          utf8_skip_lead_ <<= 1;                                \
        }                                                       \
  } while (0)
155
/*
 * UTF8_SKIP_BWD(p) -- step p backwards to the start of the previous
 * character: p is pre-decremented until it no longer points at a
 * continuation byte (10xxxxxx).
 *
 * The expansion is a `while` header without a body; the statement that
 * follows the macro call (normally just `;`) becomes the loop body.
 */
#define UTF8_SKIP_BWD(p) while (0x80 == (0xc0 & *--(p)))
157
158 static inline uns
159 utf8_space(uns u)
160 {
161   if (u < 0x80)
162     return 1;
163   if (u < 0x800)
164     return 2;
165   if (u < (1<<16))
166     return 3;
167   if (u < (1<<21))
168     return 4;
169   if (u < (1<<26))
170     return 5;
171   return 6;
172 }
173
174 static inline uns
175 utf8_encoding_len(uns c)
176 {
177   if (c < 0x80)
178     return 1;
179   ASSERT(c >= 0xc0 && c < 0xfe);
180   if (c < 0xe0)
181     return 2;
182   if (c < 0xf0)
183     return 3;
184   if (c < 0xf8)
185     return 4;
186   if (c < 0xfc)
187     return 5;
188   return 6;
189 }
190
191 /* unicode-utf8.c */
192
193 uns utf8_strlen(byte *str);
194 uns utf8_strnlen(byte *str, uns n);
195
196 #endif