From b82e82eb1013f87a2bae298818949d86047680e2 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Mon, 21 Jan 2008 15:36:49 +0100 Subject: [PATCH] Fixed several bugs in UTF-16 encoding/decoding (tested on large files). --- charset/charconv-gen.h | 3 +++ charset/charconv.c | 41 +++++++++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/charset/charconv-gen.h b/charset/charconv-gen.h index 5a83232f..7a7f0eff 100644 --- a/charset/charconv-gen.h +++ b/charset/charconv-gen.h @@ -13,6 +13,7 @@ do { /*** Header ***/ + const byte *s, *se; byte *d, *de; uns code; @@ -193,6 +194,7 @@ got_code: else { write_slow: + c->code = code; c->state = UTF16_BE_WRITE; goto go_slow; } @@ -217,6 +219,7 @@ write_slow: else { write_slow: + c->code = code; c->state = UTF16_LE_WRITE; goto go_slow; } diff --git a/charset/charconv.c b/charset/charconv.c index ce22d1e0..54dac42b 100644 --- a/charset/charconv.c +++ b/charset/charconv.c @@ -89,6 +89,7 @@ seq: } if (c->code >= 0x10000) c->code = 0xfffd; +got_char: c->source = s; c->state = 0; return -1; @@ -133,19 +134,23 @@ seq: { void *p = &c->code; c->string_at = p; - if (c->code < 0xd800 || c->code - 0xe000 < 0x2000) + uns code = c->code; + c->string_at = p; + if (code < 0xd800 || code - 0xe000 < 0x2000) {} - else if ((c->code -= 0x10000) < 0x100000) + else if ((code -= 0x10000) < 0x100000) { - put_u16_be(p, 0xd800 | (c->code >> 10)); - put_u16_be(p + 2, 0xdc00 | (c->code & 0x3ff)); + put_u16_be(p, 0xd800 | (code >> 10)); + put_u16_be(p + 2, 0xdc00 | (code & 0x3ff)); c->remains = 4; + c->state = SEQ_WRITE; goto seq; } else - c->code = UNI_REPLACEMENT; - put_u16_be(p, c->code); + code = UNI_REPLACEMENT; + put_u16_be(p, code); c->remains = 2; + c->state = SEQ_WRITE; goto seq; } @@ -154,18 +159,22 @@ seq: { void *p = &c->code; c->string_at = p; - if (c->code < 0xd800 || c->code - 0xe000 < 0x2000) + uns code = c->code; + c->string_at = p; + if (code < 0xd800 || code - 0xe000 < 0x2000) {} - else if ((c->code -= 0x10000) < 0x100000) + else if ((code -= 0x10000) < 0x100000) { - put_u16_le(p, 0xd800 | (c->code >> 10)); - put_u16_le(p + 2, 0xdc00 | (c->code & 0x3ff)); + put_u16_le(p, 0xd800 | (code >> 10)); + put_u16_le(p + 2, 0xdc00 | (code & 0x3ff)); c->remains = 4; + c->state = SEQ_WRITE; } else - c->code = UNI_REPLACEMENT; - put_u16_le(p, c->code); + code = UNI_REPLACEMENT; + put_u16_le(p, code); c->remains = 2; + c->state = SEQ_WRITE; goto seq; } @@ -181,7 +190,7 @@ seq: goto cse; c->code = (c->code << 8) | *s++; if (c->code - 0xd800 >= 0x800) - break; + goto got_char; c->code = (c->code - 0xd800) << 10; c->state = UTF16_BE_READ_2; /* fall-thru */ @@ -203,7 +212,7 @@ seq: else c->code = UNI_REPLACEMENT; s++; - break; + goto got_char; /* Reading of UTF16-LE */ case UTF16_LE_READ: @@ -217,7 +226,7 @@ seq: goto cse; c->code |= *s++ << 8; if (c->code - 0xd800 >= 0x800) - break; + goto got_char; c->code = (c->code - 0xd800) << 10; c->state = UTF16_LE_READ_2; /* fall-thru */ @@ -235,7 +244,7 @@ seq: else c->code = UNI_REPLACEMENT; s++; - break; + goto got_char; default: ASSERT(0); -- 2.39.5