From 9935e649cbebddb44a21df196fb92ad6e703eb23 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Thu, 6 Nov 2008 17:26:41 +0100 Subject: [PATCH] Documented chartype.h and unicode.h. The Cxvalue macro converted to a function. --- ucw/char-cat.c | 1 + ucw/char-lower.c | 1 + ucw/char-upper.c | 1 + ucw/chartype.h | 40 ++++++++----- ucw/doc/Makefile | 2 +- ucw/doc/chartype.txt | 4 ++ ucw/doc/index.txt | 6 +- ucw/doc/unicode.txt | 4 ++ ucw/unicode.h | 130 +++++++++++++++++++++++++++---------------- 9 files changed, 122 insertions(+), 67 deletions(-) create mode 100644 ucw/doc/chartype.txt create mode 100644 ucw/doc/unicode.txt diff --git a/ucw/char-cat.c b/ucw/char-cat.c index 5a984bea..88221fac 100644 --- a/ucw/char-cat.c +++ b/ucw/char-cat.c @@ -7,6 +7,7 @@ * of the GNU Lesser General Public License. */ +#include "ucw/lib.h" #include "ucw/chartype.h" const unsigned char _c_cat[256] = { diff --git a/ucw/char-lower.c b/ucw/char-lower.c index 700ae370..31b55b54 100644 --- a/ucw/char-lower.c +++ b/ucw/char-lower.c @@ -7,6 +7,7 @@ * of the GNU Lesser General Public License. */ +#include "ucw/lib.h" #include "ucw/chartype.h" const unsigned char _c_lower[256] = { diff --git a/ucw/char-upper.c b/ucw/char-upper.c index 9b08809f..3b8d3171 100644 --- a/ucw/char-upper.c +++ b/ucw/char-upper.c @@ -7,6 +7,7 @@ * of the GNU Lesser General Public License. */ +#include "ucw/lib.h" #include "ucw/chartype.h" const unsigned char _c_upper[256] = { diff --git a/ucw/chartype.h b/ucw/chartype.h index 09dc1ec4..930bf90e 100644 --- a/ucw/chartype.h +++ b/ucw/chartype.h @@ -10,6 +10,14 @@ #ifndef _UCW_CHARTYPE_H #define _UCW_CHARTYPE_H +/*** + * We define our own routines to classify 8-bit characters (based on US-ASCII charset). + * This way we bypass most possible problems with different compilation environments. + * + * All functions and macros accept any numeric parameters and if it is necessary, they simply ignore higher bits. + * It does not matter whether a parameter is signed or unsigned. 
+ ***/ + #define _C_UPPER 1 /* Upper-case letters */ #define _C_LOWER 2 /* Lower-case letters */ #define _C_PRINT 4 /* Printable */ @@ -29,21 +37,27 @@ extern const unsigned char _c_cat[256], _c_upper[256], _c_lower[256]; #define Category(x) (_c_cat[(unsigned char)(x)]) #define Ccat(x,y) (Category(x) & y) -#define Cupper(x) Ccat(x, _C_UPPER) -#define Clower(x) Ccat(x, _C_LOWER) -#define Calpha(x) Ccat(x, _C_ALPHA) -#define Calnum(x) Ccat(x, _C_ALNUM) -#define Cprint(x) Ccat(x, _C_PRINT) -#define Cdigit(x) Ccat(x, _C_DIGIT) -#define Cxdigit(x) Ccat(x, _C_XDIGIT) -#define Cword(x) Ccat(x, _C_WORD) -#define Cblank(x) Ccat(x, _C_BLANK) -#define Cctrl(x) Ccat(x, _C_CTRL) +#define Cupper(x) Ccat(x, _C_UPPER) /** Checks for an upper-case character (`A-Z`). **/ +#define Clower(x) Ccat(x, _C_LOWER) /** Checks for a lower-case character (`a-z`). **/ +#define Calpha(x) Ccat(x, _C_ALPHA) /** Checks for an alphabetic character (`a-z`, `A-Z`). **/ +#define Calnum(x) Ccat(x, _C_ALNUM) /** Checks for an alpha-numeric character (`a-z`, `A-Z`, `0-9`). **/ +#define Cprint(x) Ccat(x, _C_PRINT) /** Checks for printable characters, including 8-bit values (`\t`, `0x20-0x7E`, `0x80-0xFF`). **/ +#define Cdigit(x) Ccat(x, _C_DIGIT) /** Checks for a digit (`0-9`). **/ +#define Cxdigit(x) Ccat(x, _C_XDIGIT) /** Checks for a hexadecimal digit (`0-9`, `a-f`, `A-F`). **/ +#define Cword(x) Ccat(x, _C_WORD) /** Checks for an alpha-numeric character or inner punctuation (`a-z`, `A-Z`, `0-9`, `_`). **/ +#define Cblank(x) Ccat(x, _C_BLANK) /** Checks for a white space (`0x20`, `\t`, `\n`, `\r`, `0x8`, `0xC`). **/ +#define Cctrl(x) Ccat(x, _C_CTRL) /** Checks for control characters (`0x0-0x1F`, `0x7F`). **/ #define Cspace(x) Cblank(x) -#define Cupcase(x) _c_upper[(unsigned char)(x)] -#define Clocase(x) _c_lower[(unsigned char)(x)] +#define Cupcase(x) (_c_upper[(unsigned char)(x)]) /** Convert a letter to upper case, leave non-letter characters unchanged. 
**/ +#define Clocase(x) (_c_lower[(unsigned char)(x)]) /** Convert a letter to lower case, leave non-letter characters unchanged. **/ -#define Cxvalue(x) (((x)<'A')?((x)-'0'):(((x)&0xdf)-'A'+10)) +/** + * Compute the value of a valid hexadecimal character (ie. passed the @Cxdigit() check). + **/ +static inline uns Cxvalue(byte x) +{ + return (x < (uns)'A') ? x - '0' : (x & 0xdf) - 'A' + 10; +} #endif diff --git a/ucw/doc/Makefile b/ucw/doc/Makefile index e805f79f..b6b713df 100644 --- a/ucw/doc/Makefile +++ b/ucw/doc/Makefile @@ -2,7 +2,7 @@ DIRS+=ucw/doc -UCW_DOCS=fastbuf index config configure install basecode hash docsys conf mempool mainloop generic growbuf unaligned lists +UCW_DOCS=fastbuf index config configure install basecode hash docsys conf mempool mainloop generic growbuf unaligned lists chartype unicode UCW_INDEX=$(o)/ucw/doc/def_index.html UCW_DOCS_HTML=$(addprefix $(o)/ucw/doc/,$(addsuffix .html,$(UCW_DOCS))) diff --git a/ucw/doc/chartype.txt b/ucw/doc/chartype.txt new file mode 100644 index 00000000..a19560dd --- /dev/null +++ b/ucw/doc/chartype.txt @@ -0,0 +1,4 @@ +Single-byte characters +====================== + +!!ucw/chartype.h diff --git a/ucw/doc/index.txt b/ucw/doc/index.txt index 4621f144..d0410619 100644 --- a/ucw/doc/index.txt +++ b/ucw/doc/index.txt @@ -23,6 +23,8 @@ Modules - <> - <> - <> +- <> +- <> Other features -------------- @@ -53,10 +55,6 @@ Yet undocumented modules * `bitarray.h` * `bitopts.h` * `bitsig.h` -- Character manipulation - * `char-map.h` - * `chartype.h` - * `unicode.h` - String manipulation * `kmp.h` * `kmp-search.h` diff --git a/ucw/doc/unicode.txt b/ucw/doc/unicode.txt new file mode 100644 index 00000000..c58eeaa0 --- /dev/null +++ b/ucw/doc/unicode.txt @@ -0,0 +1,4 @@ +Multi-byte characters +===================== + +!!ucw/unicode.h diff --git a/ucw/unicode.h b/ucw/unicode.h index a9805c06..74416c6d 100644 --- a/ucw/unicode.h +++ b/ucw/unicode.h @@ -16,12 +16,13 @@ /* Macros for handling UTF-8 */ -#define 
UNI_REPLACEMENT 0xfffc +#define UNI_REPLACEMENT 0xfffc /** Unicode value used as a default replacement of invalid characters. **/ -/* Encode a character from the basic multilingual plane [0, 0xFFFF] - * (subset of Unicode 4.0); up to 3 bytes needed (RFC2279) */ -static inline byte * -utf8_put(byte *p, uns u) +/** + * Encode a value from the range `[0, 0xFFFF]` + * (basic multilingual plane); up to 3 bytes needed (RFC2279). + **/ +static inline byte *utf8_put(byte *p, uns u) { if (u < 0x80) *p++ = u; @@ -40,10 +41,11 @@ utf8_put(byte *p, uns u) return p; } -/* Encode a value from the range [0, 0x7FFFFFFF]; - * (superset of Unicode 4.0) up to 6 bytes needed (RFC2279) */ -static inline byte * -utf8_32_put(byte *p, uns u) +/** + * Encode a value from the range `[0, 0x7FFFFFFF]`; + * (superset of Unicode 4.0) up to 6 bytes needed (RFC2279). + **/ +static inline byte *utf8_32_put(byte *p, uns u) { if (u < 0x80) *p++ = u; @@ -83,10 +85,11 @@ put1: *p++ = 0x80 | (u & 0x3f); #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f) -/* Decode a character from the basic multilingual plane [0, 0xFFFF] - * or return 'repl' if the encoding has been corrupted */ -static inline byte * -utf8_get_repl(const byte *p, uns *uu, uns repl) +/** + * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane) + * or return @repl if the encoding has been corrupted. + **/ +static inline byte *utf8_get_repl(const byte *p, uns *uu, uns repl) { uns u = *p++; if (u < 0x80) @@ -114,10 +117,11 @@ utf8_get_repl(const byte *p, uns *uu, uns repl) return (byte *)p; } -/* Decode a value from the range [0, 0x7FFFFFFF] - * or return 'repl' if the encoding has been corrupted */ -static inline byte * -utf8_32_get_repl(const byte *p, uns *uu, uns repl) +/** + * Decode a value from the range `[0, 0x7FFFFFFF]` + * or return @repl if the encoding has been corrupted. 
+ **/ +static inline byte *utf8_32_get_repl(const byte *p, uns *uu, uns repl) { uns u = *p++; if (u < 0x80) @@ -163,18 +167,20 @@ get1: UTF8_GET_NEXT; return (byte *)p; } -/* Decode a character from the basic multilingual plane [0, 0xFFFF] - * or return UNI_REPLACEMENT if the encoding has been corrupted */ -static inline byte * -utf8_get(const byte *p, uns *uu) +/** + * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane) + * or return `UNI_REPLACEMENT` if the encoding has been corrupted. + **/ +static inline byte *utf8_get(const byte *p, uns *uu) { return utf8_get_repl(p, uu, UNI_REPLACEMENT); } -/* Decode a value from the range [0, 0x7FFFFFFF] - * or return UNI_REPLACEMENT if the encoding has been corrupted */ -static inline byte * -utf8_32_get(const byte *p, uns *uu) +/** + * Decode a value from the range `[0, 0x7FFFFFFF]` + * or return `UNI_REPLACEMENT` if the encoding has been corrupted. + **/ +static inline byte *utf8_32_get(const byte *p, uns *uu) { return utf8_32_get_repl(p, uu, UNI_REPLACEMENT); } @@ -188,8 +194,10 @@ utf8_32_get(const byte *p, uns *uu) #define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80) -static inline uns -utf8_space(uns u) +/** + * Return the number of bytes needed to encode a given value from the range `[0, 0x7FFFFFFF]` to UTF-8. + **/ +static inline uns utf8_space(uns u) { if (u < 0x80) return 1; @@ -204,8 +212,10 @@ utf8_space(uns u) return 6; } -static inline uns -utf8_encoding_len(uns c) +/** + * Compute the length of a single UTF-8 character from its first byte. The encoding must be valid. + **/ +static inline uns utf8_encoding_len(uns c) { if (c < 0x80) return 1; @@ -221,10 +231,11 @@ utf8_encoding_len(uns c) return 6; } -/* Encode a character from the range [0, 0xD7FF] or [0xE000,0x11FFFF]; - * up to 4 bytes needed */ -static inline void * -utf16_le_put(void *p, uns u) +/** + * Encode a UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,0x10FFFF]`; + * up to 4 bytes needed. 
+ **/ +static inline void *utf16_le_put(void *p, uns u) { if (u < 0xd800 || (u < 0x10000 && u >= 0xe000)) { @@ -241,8 +252,11 @@ utf16_le_put(void *p, uns u) ASSERT(0); } -static inline void * -utf16_be_put(void *p, uns u) +/** + * Encode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,0x10FFFF]`; + * up to 4 bytes needed. + **/ +static inline void *utf16_be_put(void *p, uns u) { if (u < 0xd800 || (u < 0x10000 && u >= 0xe000)) { @@ -259,10 +273,11 @@ utf16_be_put(void *p, uns u) ASSERT(0); } -/* Decode a character from the range [0, 0xD7FF] or [0xE000,11FFFF] - * or return `repl' if the encoding has been corrupted */ -static inline void * -utf16_le_get_repl(const void *p, uns *uu, uns repl) +/** + * Decode a UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,0x10FFFF]` + * or return @repl if the encoding has been corrupted. + **/ +static inline void *utf16_le_get_repl(const void *p, uns *uu, uns repl) { uns u = get_u16_le(p), x, y; x = u - 0xd800; @@ -278,8 +293,11 @@ utf16_le_get_repl(const void *p, uns *uu, uns repl) return (void *)(p + 2); } -static inline void * -utf16_be_get_repl(const void *p, uns *uu, uns repl) +/** + * Decode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,0x10FFFF]` + * or return @repl if the encoding has been corrupted. + **/ +static inline void *utf16_be_get_repl(const void *p, uns *uu, uns repl) { uns u = get_u16_be(p), x, y; x = u - 0xd800; @@ -295,22 +313,28 @@ utf16_be_get_repl(const void *p, uns *uu, uns repl) return (void *)(p + 2); } -/* Decode a character from the range [0, 0xD7FF] or [0xE000,11FFFF] - * or return UNI_REPLACEMENT if the encoding has been corrupted */ -static inline void * -utf16_le_get(const void *p, uns *uu) +/** + * Decode a UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,0x10FFFF]` + * or return `UNI_REPLACEMENT` if the encoding has been corrupted. 
+ **/ +static inline void *utf16_le_get(const void *p, uns *uu) { return utf16_le_get_repl(p, uu, UNI_REPLACEMENT); } -static inline void * -utf16_be_get(const void *p, uns *uu) +/** + * Decode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,0x10FFFF]` + * or return `UNI_REPLACEMENT` if the encoding has been corrupted. + **/ +static inline void *utf16_be_get(const void *p, uns *uu) { return utf16_be_get_repl(p, uu, UNI_REPLACEMENT); } -static inline uns -unicode_sanitize_char(uns u) +/** + * Check a Unicode value and if it seems to be useless (defined by Ucwlib; it may change in the future) return `UNI_REPLACEMENT` instead. + **/ +static inline uns unicode_sanitize_char(uns u) { if (u >= 0x10000 || // We don't accept anything outside the basic plane u >= 0xd800 && u < 0xf900 || // neither we do surrogates @@ -322,7 +346,15 @@ unicode_sanitize_char(uns u) /* unicode-utf8.c */ +/** + * Count the number of Unicode characters in a zero-terminated UTF-8 string. + * Returned value for corrupted encoding is undefined, but is never greater than `strlen(str)`. + **/ uns utf8_strlen(const byte *str); + +/** + * Same as @utf8_strlen(), but returns at most @n characters. + **/ uns utf8_strnlen(const byte *str, uns n); #endif -- 2.39.2