From 790966f9e1eb198ec8e6374d66c9211b01fd9a12 Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Mon, 25 Mar 2013 16:43:15 +0100 Subject: [PATCH] New module: variable-length integer encoding Written by Tomas Valla. --- ucw/Makefile | 16 ++-- ucw/ff-varint.c | 106 ++++++++++++++++++++++++ ucw/ff-varint.h | 56 +++++++++++++ ucw/ff-varint.t | 11 +++ ucw/varint.c | 112 +++++++++++++++++++++++++ ucw/varint.h | 213 ++++++++++++++++++++++++++++++++++++++++++++++++ ucw/varint.t | 70 ++++++++++++++++ 7 files changed, 578 insertions(+), 6 deletions(-) create mode 100644 ucw/ff-varint.c create mode 100644 ucw/ff-varint.h create mode 100644 ucw/ff-varint.t create mode 100644 ucw/varint.c create mode 100644 ucw/varint.h create mode 100644 ucw/varint.t diff --git a/ucw/Makefile b/ucw/Makefile index 6c47e712..9f1247fb 100644 --- a/ucw/Makefile +++ b/ucw/Makefile @@ -15,9 +15,9 @@ LIBUCW_MODS= \ log log-stream log-file log-syslog log-conf tbf \ conf-context conf-alloc conf-dump conf-input conf-intr conf-journal conf-parse conf-section conf-getopt \ ipaccess \ - fastbuf ff-binary ff-string ff-printf ff-unicode ff-stkstring \ + fastbuf ff-binary ff-string ff-printf ff-unicode ff-varint ff-stkstring \ fb-file fb-mem fb-temp tempfile fb-mmap fb-limfd fb-buffer fb-grow fb-pool fb-atomic fb-param fb-socket fb-multi \ - char-cat char-upper char-lower unicode stkstring \ + char-cat char-upper char-lower unicode varint stkstring \ wildmatch regex \ prime primetable random \ time-stamp time-timer time-conf \ @@ -41,7 +41,7 @@ LIBUCW_MAIN_INCLUDES= \ lib.h log.h threads.h time.h \ mempool.h eltpool.h \ clists.h slists.h simple-lists.h \ - string.h stkstring.h unicode.h chartype.h regex.h \ + string.h stkstring.h unicode.h varint.h chartype.h regex.h \ wildmatch.h \ unaligned.h \ bbuf.h gbuf.h gary.h bitarray.h bitsig.h \ @@ -51,7 +51,7 @@ LIBUCW_MAIN_INCLUDES= \ prime.h \ bitops.h \ conf.h getopt.h ipaccess.h \ - fastbuf.h io.h ff-unicode.h ff-binary.h \ + fastbuf.h io.h ff-unicode.h ff-varint.h ff-binary.h \ url.h \ mainloop.h \ process.h \ @@ -97,6 +97,8 @@ endif $(o)/ucw/hashfunc.o $(o)/ucw/hashfunc.oo: CFLAGS += -funroll-loops $(o)/ucw/lizard.o: CFLAGS += $(COPT2) -funroll-loops +$(o)/ucw/ff-varint-t: $(LIBUCW) +$(o)/ucw/varint-t: $(LIBUCW) $(o)/ucw/conf-test: $(o)/ucw/conf-test.o $(LIBUCW) $(o)/ucw/hash-test: $(o)/ucw/hash-test.o $(LIBUCW) $(o)/ucw/hashfunc-test: $(o)/ucw/hashfunc-test.o $(LIBUCW) @@ -112,13 +114,14 @@ endif $(o)/ucw/ipaccess-test: $(o)/ucw/ipaccess-test.o $(LIBUCW) $(o)/ucw/trie-test: $(o)/ucw/trie-test.o $(LIBUCW) -TESTS+=$(addprefix $(o)/ucw/,regex.test unicode.test hash-test.test mempool.test stkstring.test \ - slists.test bbuf.test kmp-test.test getopt.test ff-unicode.test eltpool.test \ +TESTS+=$(addprefix $(o)/ucw/,varint.test regex.test unicode.test hash-test.test mempool.test stkstring.test \ + slists.test bbuf.test kmp-test.test getopt.test ff-unicode.test ff-varint.test eltpool.test \ fb-socket.test trie-test.test string.test sha1.test asort-test.test binheap-test.test \ redblack-test.test fb-file.test fb-grow.test fb-pool.test fb-atomic.test \ fb-limfd.test fb-temp.test fb-mem.test fb-buffer.test fb-mmap.test fb-multi.test url.test strtonum-test.test \ gary.test time.test crc.test signames.test md5.test) +$(o)/ucw/varint.test: $(o)/ucw/varint-t $(o)/ucw/regex.test: $(o)/ucw/regex-t $(o)/ucw/unicode.test: $(o)/ucw/unicode-t $(o)/ucw/hash-test.test: $(o)/ucw/hash-test @@ -130,6 +133,7 @@ $(o)/ucw/kmp-test.test: $(o)/ucw/kmp-test $(o)/ucw/bbuf.test: $(o)/ucw/bbuf-t $(o)/ucw/getopt.test: $(o)/ucw/getopt-t $(o)/ucw/ff-unicode.test: $(o)/ucw/ff-unicode-t +$(o)/ucw/ff-varint.test: $(o)/ucw/ff-varint-t $(o)/ucw/eltpool.test: $(o)/ucw/eltpool-t $(o)/ucw/string.test: $(o)/ucw/str-hex-t $(o)/ucw/str-esc-t $(o)/ucw/str-fix-t $(o)/ucw/sha1.test: $(o)/ucw/sha1-t $(o)/ucw/sha1-hmac-t diff --git a/ucw/ff-varint.c b/ucw/ff-varint.c new file mode 100644 index 00000000..453a26f7 --- /dev/null +++ b/ucw/ff-varint.c @@ -0,0 +1,106 @@ +/* + * UCW Library: Reading and writing Varints on Fastbuf Streams + * + * (c) 2013 Tomas Valla + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#include +#include +#include +#include +#include + + +u64 bget_varint_slow(struct fastbuf *b, u64 repl) +{ + uns h = bgetc(b); + uns l = varint_len(h); + byte buf[l]; + buf[0] = h; + l--; + if (breadb(b, buf+1, l) < l) + return repl; + varint_get(buf, &repl); + return repl; +} + +void bput_varint_slow(struct fastbuf *b, u64 u) +{ + byte buf[9]; + uns l = varint_put(buf, u); + bwrite(b, buf, l); +} + +#ifdef TEST + +#include +#include + +int main(int argc, char **argv) +{ + #define FUNCS \ + F(BGET_VARINT) F(BPUT_VARINT) + + enum { + #define F(x) FUNC_##x, + FUNCS + #undef F + }; + char *names[] = { + #define F(x) [FUNC_##x] = #x, + FUNCS + #undef F + }; + + uns func = ~0U; + if (argc > 1) + for (uns i = 0; i < ARRAY_SIZE(names); i++) + if (!strcasecmp(names[i], argv[1])) + func = i; + if (!~func) { + fprintf(stderr, "Invalid usage!\n"); + return 1; + } + + struct fastbuf *b = fbgrow_create(8); + switch (func) { + uns u; + u64 r; + int i; + case FUNC_BGET_VARINT: + while (scanf("%x", &u) == 1) + bputc(b, u); + fbgrow_rewind(b); + while (bpeekc(b) >= 0) { + if (btell(b)) + putchar(' '); + r = bget_varint_slow(b, ~0LLU); + printf("%lx", r); + } + putchar('\n'); + break; + + case FUNC_BPUT_VARINT: + i = 0; + while (scanf("%lx", &r) == 1) + bput_varint_slow(b, r); + fbgrow_rewind(b); + while (bpeekc(b) >= 0) { + if (i++) + putchar(' '); + printf("%02x", bgetc(b)); + } + putchar('\n'); + break; + default: + ASSERT(0); + } + + bclose(b); + return 0; +} + +#endif diff --git a/ucw/ff-varint.h b/ucw/ff-varint.h new file mode 100644 index 00000000..3228fbfa --- /dev/null +++ b/ucw/ff-varint.h @@ -0,0 +1,56 @@ +/* + * UCW Library: Reading and writing Varints on Fastbuf Streams + * + * (c) 2013 Tomas Valla + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _UCW_FF_VARINT_H +#define _UCW_FF_VARINT_H + +#include +#include + +u64 bget_varint_slow(struct fastbuf *b, u64 repl); +void bput_varint_slow(struct fastbuf *b, u64 u); + +/** + * Reads u64 encoded as varint from the fastbuf b. + * If the read is unsuccessful, returns repl. + **/ +static inline u64 bget_varint_repl(struct fastbuf *b, u64 repl) +{ + uns l; + if (bavailr(b) >= 1) { + l = varint_len(*b->bptr); + if (bavailr(b) >= l) { + b->bptr += l; + varint_get(b->bptr, &repl); + return repl; + } + } + return bget_varint_slow(b, repl); +} + +/** + * Reads u64 encoded as varint from the fastbuf b. + * If the read is unsuccessful, returns ~0LLU. + **/ +static inline int bget_varint(struct fastbuf *b) +{ + return bget_varint_repl(b, ~0LLU); +} + +/** Writes u64 u encoded as varint to the fastbuf b. **/ +static inline void bput_varint(struct fastbuf *b, u64 u) +{ + uns l = varint_space(u); + if (bavailw(b) >= l) + b->bptr += varint_put(b->bptr, u); + else + bput_varint_slow(b, u); +} + +#endif diff --git a/ucw/ff-varint.t b/ucw/ff-varint.t new file mode 100644 index 00000000..685ae433 --- /dev/null +++ b/ucw/ff-varint.t @@ -0,0 +1,11 @@ +# Tests for the FF-Varint module + +Name: bput_varint +Run: ../obj/ucw/ff-varint-t bput_varint +In: deadbeef 2222 feeddeadbabebeef +Out: f0 ce 8d 7e 6f a1 a2 ff fd eb da a5 aa 9e 7e 6f + +Name: bget_varint +Run: ../obj/ucw/ff-varint-t bget_varint +In: ff fd eb da a5 aa 9e 7e 6f f0 ce 8d 7e 6f a1 a2 +Out: feeddeadbabebeef deadbeef 2222 diff --git a/ucw/varint.c b/ucw/varint.c new file mode 100644 index 00000000..18debb14 --- /dev/null +++ b/ucw/varint.c @@ -0,0 +1,112 @@ +/* + * UCW Library -- Coding of u64 into variable length bytecode. + * + * (c) 2013 Tomas Valla + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#include +#include + +#define PUTB(j,i) p[j] = (byte)((u >> (8*(i)))); +#define PUTB4(b) PUTB(0,b-1) PUTB(1,b-2) PUTB(2,b-3) PUTB(3,b-4) +uns varint_put_big(byte *p, u64 u) +{ + ASSERT(u >= VARINT_SHIFT_L4); + + if (u < VARINT_SHIFT_L5) { + u -= VARINT_SHIFT_L4; + PUTB4(5) + p[0] |= 0xf0; + PUTB(4,0); + return 5; + } + if (u < VARINT_SHIFT_L6) { + u -= VARINT_SHIFT_L5; + PUTB4(6) + PUTB(4,1) PUTB(5,0) + p[0] |= 0xf8; + return 6; + } + if (u < VARINT_SHIFT_L7) { + u -= VARINT_SHIFT_L6; + PUTB4(7) + p[0] |= 0xfc; + PUTB(4,2) PUTB(5,1) PUTB(6,0) + return 7; + } + if (u < VARINT_SHIFT_L8) { + u -= VARINT_SHIFT_L7; + p[0] = 0xfe; + PUTB(1,6) PUTB(2,5) PUTB(3,4) PUTB(4,3) + PUTB(5,2) PUTB(6,1) PUTB(7,0) + return 8; + } + u -= VARINT_SHIFT_L8; + p[0] = 0xff; + PUTB(1,7) PUTB(2,6) PUTB(3,5) PUTB(4,4) + PUTB(5,3) PUTB(6,2) PUTB(7,1) PUTB(8,0) + return 9; +} + +const byte *varint_get_big(const byte *p, u64 *r) +{ + ASSERT((*p & 0xf0) == 0xf0); + + byte h = ~*p; + if (h & 0x08) { + *r = (u64)(p[0] & 7)<<32 | (u64)p[1]<<24 | (u64)p[2]<<16 | (u64)p[3]<<8 | (u64)p[4]; + *r += VARINT_SHIFT_L4; + return p+5; + } + if (h & 0x04) { + *r = (u64)(p[0] & 3)<<40 | (u64)p[1]<<32 | (u64)p[2]<<24 | (u64)p[3]<<16 | (u64)p[4]<<8 | (u64)p[5]; + *r += VARINT_SHIFT_L5; + return p+6; + } + if (h & 0x02) { + *r = (u64)(p[0] & 1)<<48 | (u64)p[1]<<40 | (u64)p[2]<<32 | (u64)p[3]<<24 | (u64)p[4]<<16 | (u64)p[5]<<8 | (u64)p[6]; + *r += VARINT_SHIFT_L6; + return p+7; + } + if (h & 0x01) { + *r = (u64)p[1]<<48 | (u64)p[2]<<40 | (u64)p[3]<<32 | (u64)p[4]<<24 | (u64)p[5]<<16 | (u64)p[6]<<8 | (u64)p[7]; + *r += VARINT_SHIFT_L7; + return p+8; + } + *r = ((u64)p[1] << 56) | ((u64)p[2] << 48) | ((u64)p[3] << 40) | ((u64)p[4] << 32) | ((u64)p[5] << 24) | ((u64)p[6] << 16) | ((u64)p[7] << 8) | (u64)p[8]; + *r += VARINT_SHIFT_L8; + return p+9; +} + + +#ifdef TEST + +#include +#include + +int main(int argc, char **argv UNUSED) +{ + byte buf[16] = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa }; + u64 u; + + if (scanf("%lx", &u) != 1) { + fprintf(stderr, "Invalid usage!\n"); + return 1; + } + varint_put(buf, u); + int l = varint_len(buf[0]); + varint_get(buf, &u); + printf("%u %d %jx", varint_space(u), l, (uintmax_t) u); + if (argc > 1) { + for (int i=0; i + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _UCW_VARINT_H +#define _UCW_VARINT_H + +/*** + * The encoding works in the following way: + * + * First byte Stored values + * + * 0xxxxxxx 7 bits + * 10xxxxxx +1 byte 14 bits, value shifted by 2^7 + * 110xxxxx +2 bytes 21 bits, value shifted by 2^7+2^14 + * 1110xxxx +3 bytes 28 bits, value shifted by 2^7+2^14+2^21 + * .... + * 11111110 +7 bytes 56 bits, value shifted by 2^7+2^14+2^21+2^28+2^35+2^42 + * 11111111 +8 bytes full 64 bits, value shifted by 2^7+2^14+2^21+2^28+2^35+2^42+2^56 + * + * The values are stored in bigendian to allow lexicographic sorting. + * The encoding and algorithms are aimed to be optimised for storing shorter numbers. + * + * There is an invalid combination: if the first two bytes are 0xff 0xff. + ***/ + + +#define VARINT_SHIFT_L1 0x80 +#define VARINT_SHIFT_L2 0x4080 +#define VARINT_SHIFT_L3 0x204080 +#define VARINT_SHIFT_L4 0x10204080 +#define VARINT_SHIFT_L5 0x0810204080 +#define VARINT_SHIFT_L6 0x040810204080 +#define VARINT_SHIFT_L7 0x02040810204080 +#define VARINT_SHIFT_L8 0x0102040810204080 + +/* + * The following code is already unrolled, to maximize the speed. + * Yes, it is ugly. + */ + +#define PUTB(j,i) p[j] = (byte)((u >> (8*(i)))); +#define PUTB4(b) PUTB(0,b-1) PUTB(1,b-2) PUTB(2,b-3) PUTB(3,b-4) + +/* for internal use only, need the length > 4 */ +uns varint_put_big(byte *p, u64 u); +const byte *varint_get_big(const byte *p, u64 *r); + +/** + * Encode u64 value u into byte sequence p. + * Returns the number of bytes used (at least 1 and at most 9). + **/ +static inline uns varint_put(byte *p, u64 u) +{ + if (u < VARINT_SHIFT_L1) { + p[0] = (byte)u; + return 1; + } + if (u < VARINT_SHIFT_L2) { + u -= VARINT_SHIFT_L1; + PUTB(0,1) + p[0] |= 0x80; + PUTB(1,0) + return 2; + } + if (u < VARINT_SHIFT_L3) { + u -= VARINT_SHIFT_L2; + PUTB(0,2) + p[0] |= 0xc0; + PUTB(1,1) PUTB(2,0) + return 3; + } + if (u < VARINT_SHIFT_L4) { + u -= VARINT_SHIFT_L3; + PUTB4(4) + p[0] |= 0xe0; + return 4; + } + return varint_put_big(p, u); +} +#undef GETB +#undef GETB4 + +/** + * Decode u64 value from a byte sequence p into res. + * The invalid combination is not detected. + * Returns pointer to the next position after the sequence. + **/ +static inline const byte *varint_get(const byte *p, u64 *res) +{ + byte h = ~*p; + + if (h & 0x80) { + *res = p[0]; + return p+1; + } + if (h & 0x40) { + *res = (p[0] & 0x3f)<<8 | p[1]; + *res += VARINT_SHIFT_L1; + return p+2; + } + if (h & 0x20) { + *res = (p[0] & 0x1f)<<16 | p[1]<<8 | p[2]; + *res += VARINT_SHIFT_L2; + return p+3; + } + if (h & 0x10) { + *res = (p[0] & 0xf)<<24 | p[1]<<16 | p[2]<<8 | p[3]; + *res += VARINT_SHIFT_L3; + return p+4; + } + return varint_get_big(p, res); +} + +/** + * Decode at most u32 value from a byte sequence p into u32 res. + * The invalid combination is not detected. + * Returns pointer to the next position after the sequence. + * If the stored number cannot fit into u32, fill res by 0xffffffff and returns p. + **/ +static inline const byte *varint_get32(const byte *p, u32 *res) +{ + byte h = ~*p; + + if (h & 0x80) { + *res = p[0]; + return p+1; + } + if (h & 0x40) { + *res = (p[0] & 0x3f)<<8 | p[1]; + *res += VARINT_SHIFT_L1; + return p+2; + } + if (h & 0x20) { + *res = (p[0] & 0x1f)<<16 | p[1]<<8 | p[2]; + *res += VARINT_SHIFT_L2; + return p+3; + } + if (h & 0x10) { + *res = (p[0] & 0xf)<<24 | p[1]<<16 | p[2]<<8 | p[3]; + *res += VARINT_SHIFT_L3; + return p+4; + } + u64 r = (u64)(p[0] & 7)<<32 | (u64)p[1]<<24 | (u64)p[2]<<16 | (u64)p[3]<<8 | (u64)p[4]; + r += VARINT_SHIFT_L4; + if (r > 0xffffffff) { + *res = 0xffffffff; + return p; + } + *res = r; + return p+5; +} + +/** Does the byte sequence p code the invalid sequence? **/ +static inline int varint_invalid(const byte *p) +{ + return p[0] == 0xff && p[1] == 0xff; +} + +/** + * Store the invalid sequence. + * Returns always 2 (2 bytes were used, to be consistent with varint_put). + **/ +static inline uns varint_put_invalid(byte *p) +{ + p[0] = p[1] = 0xff; + return 2; +} + +/** Compute the length of encoding in bytes from the first byte hdr of the encoding. **/ +static inline uns varint_len(const byte hdr) +{ + byte b = ~hdr; + + uns l = 0; + if (!b) + l = -1; + else { + if (b & 0xf0) { l += 4; b &= 0xf0; } + if (b & 0xcc) { l += 2; b &= 0xcc; } + if (b & 0xaa) l++; + } + return 8 - l; +} + +/** Compute the number of bytes needed to store the value u. **/ +static inline uns varint_space(u64 u) +{ + if (u < VARINT_SHIFT_L1) + return 1; + if (u < VARINT_SHIFT_L2) + return 2; + if (u < VARINT_SHIFT_L3) + return 3; + if (u < VARINT_SHIFT_L4) + return 4; + if (u < VARINT_SHIFT_L5) + return 5; + if (u < VARINT_SHIFT_L6) + return 6; + if (u < VARINT_SHIFT_L7) + return 7; + if (u < VARINT_SHIFT_L8) + return 8; + return 9; +} + +#endif diff --git a/ucw/varint.t b/ucw/varint.t new file mode 100644 index 00000000..05cc9c1e --- /dev/null +++ b/ucw/varint.t @@ -0,0 +1,70 @@ +# Tests for the Varint module +# In format: +# Out format: +# when run as varint-t full: +# Out format: ... + +Name: varint (1) +Run: ../obj/ucw/varint-t full +In: 41 +Out: 1 1 41 41 + +Name: varint (2) +Run: ../obj/ucw/varint-t full +In: 80 +Out: 2 2 80 80 0 + +Name: varint (3) +Run: ../obj/ucw/varint-t full +In: ab +Out: 2 2 ab 80 2b + +Name: varint (4) +Run: ../obj/ucw/varint-t full +In: 0 +Out: 1 1 0 0 + +Name: varint (5) +Run: ../obj/ucw/varint-t full +In: 2222 +Out: 2 2 2222 a1 a2 + +Name: varint (6) +Run: ../obj/ucw/varint-t +In: abcd +Out: 3 3 abcd + +Name: varint (7) +Run: ../obj/ucw/varint-t +In: abcde +Out: 3 3 abcde + +Name: varint (8) +Run: ../obj/ucw/varint-t +In: 1020407f +Out: 4 4 1020407f + +Name: varint (9) +Run: ../obj/ucw/varint-t full +In: deadbeef +Out: 5 5 deadbeef f0 ce 8d 7e 6f + +Name: varint (10) +Run: ../obj/ucw/varint-t +In: 4081020400f +Out: 6 6 4081020400f + +Name: varint (11) +Run: ../obj/ucw/varint-t +In: 20408010204070 +Out: 8 8 20408010204070 + +Name: varint (12) +Run: ../obj/ucw/varint-t +In: feeddeadbabebeef +Out: 9 9 feeddeadbabebeef + +Name: varint (13) +Run: ../obj/ucw/varint-t +In: 102040810204080 +Out: 9 9 102040810204080 -- 2.39.2