From: Martin Mares
Date: Sat, 11 Oct 2003 10:16:31 +0000 (+0000)
Subject: Added a table of compatibility ligature expansions.
X-Git-Tag: holmes-import~1192
X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=2765481cdddd496e08cf7bc1288cdf3173074eb8;p=libucw.git

Added a table of compatibility ligature expansions.
---

diff --git a/charset/U-ligatures.h b/charset/U-ligatures.h
new file mode 100644
index 00000000..49c5334e
--- /dev/null
+++ b/charset/U-ligatures.h
@@ -0,0 +1,28 @@
+#define LIG_HASH_SIZE 24
+
+static const word *_U_lig_hash[] = {
+	NULL,
+	NULL,
+	NULL,
+	/* FB13 */ (const word []) { 0x0574, 0x0576, 0 },
+	/* FB14 */ (const word []) { 0x0574, 0x0565, 0 },
+	/* FB15 */ (const word []) { 0x0574, 0x056B, 0 },
+	/* FB16 */ (const word []) { 0x057E, 0x0576, 0 },
+	/* FB17 */ (const word []) { 0x0574, 0x056D, 0 },
+	/* FB00 */ (const word []) { 0x0066, 0x0066, 0 },
+	/* FB01 */ (const word []) { 0x0066, 0x0069, 0 },
+	/* FB02 */ (const word []) { 0x0066, 0x006C, 0 },
+	/* FB03 */ (const word []) { 0x0066, 0x0066, 0x0069, 0 },
+	/* FB04 */ (const word []) { 0x0066, 0x0066, 0x006C, 0 },
+	/* FB05 */ (const word []) { 0x0073, 0x0074, 0 },
+	/* FB06 */ (const word []) { 0x0073, 0x0074, 0 },
+	/* FB4F */ (const word []) { 0x05D0, 0x05DC, 0 },
+	NULL,
+	NULL,
+	/* 0132 */ (const word []) { 0x0049, 0x004A, 0 },
+	/* 0133 */ (const word []) { 0x0069, 0x006A, 0 },
+	NULL,
+	NULL,
+	NULL,
+	/* 0587 */ (const word []) { 0x0565, 0x0582, 0 },
+};
diff --git a/charset/misc/gen-ligatures b/charset/misc/gen-ligatures
new file mode 100755
index 00000000..8ebe71e6
--- /dev/null
+++ b/charset/misc/gen-ligatures
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+#
+#	Generate Expansion Table of Compatibility Ligatures
+#	(c) 2003 Martin Mares
+#
+
+use strict;
+use warnings;
+
+print STDERR "Reading ligature list\n";
+open(L, "misc/u-ligatures") || die "lig file open";
+my %ligs = ();
+while (<L>) {
+	chomp;
+	$ligs{$_} = 1;
+}
+close L;
+
+print STDERR "Reading decompositions\n";
+open(I, "unidata/UnicodeData.txt") || die "Unable to open UniCode data file";
+my %decs = ();
+while (<I>) {
+	chomp;
+	(/^$/ || /^#/) && next;
+	my ($code,$name,$cat,$comb,$bidir,$decomp,$d0,$d1,$n0,$mirr,$cmt1,$cmt2,$upper,$lower,$title) = split /;/;
+	$code =~ /^....$/ || next;
+	if (my ($d) = ($decomp =~ /^<compat> (.*)/)) {
+		$decs{$code} = $d;
+	}
+}
+close I;
+
+sub expand($) {
+	my ($c) = @_;
+	if (defined $decs{$c}) {
+		return join (" ", map { expand($_) } split(/\s+/, $decs{$c}));
+	} else {
+		return $c;
+	}
+}
+
+print STDERR "Searching for a perfect hash function\n";
+my $n = keys %ligs;
+my $div = $n-1;
+DIV: while (++$div) {
+	#print STDERR "Trying $div... ";
+	my @c = ();
+	foreach my $l (keys %ligs) {
+		my $i = (hex $l) % $div;
+		if (defined $c[$i]) {
+			#print STDERR "collision\n";
+			next DIV;
+		}
+		$c[$i] = 1;
+	}
+	#print STDERR "FOUND\n";
+	last;
+}
+
+print STDERR "Filling hash table with $div entries for $n ligatures\n";
+my @ht = map { "NULL" } 1..$div;
+foreach my $l (keys %ligs) {
+	my $i = (hex $l) % $div;
+	my $w = join(", ", map { "0x$_" } split(/ /, expand($l)));
+	$ht[$i] = "/* $l */ (const word []) { $w, 0 }";
+}
+
+print "#define LIG_HASH_SIZE $div\n\n";
+print "static const word *_U_lig_hash[] = {\n";
+for (my $i=0; $i<$div; $i++) {
+	print "\t", $ht[$i], ",\n";
+}
+print "};\n";
diff --git a/charset/misc/generate b/charset/misc/generate
index 61f70dff..a606e7f6 100644
--- a/charset/misc/generate
+++ b/charset/misc/generate
@@ -10,4 +10,5 @@ misc/table2h _U_lower word U-lower.h
 misc/table2h _U_upper word U-upper.h
 misc/gen-unacc >misc/u-unacc
 misc/table2h _U_unaccent word U-unacc.h
+misc/gen-ligatures >U-ligatures.h
 misc/gen-charconv chartable.h
diff --git a/charset/toligatures.c b/charset/toligatures.c
new file mode 100644
index 00000000..d5f84252
--- /dev/null
+++ b/charset/toligatures.c
@@ -0,0 +1,18 @@
+/*
+ *	The UniCode Library -- Table of Ligatures
+ *
+ *	(c) 2003 Martin Mares
+ *
+ *	This software may be freely distributed and used according to the terms
+ *	of the GNU Lesser General Public License.
+ */
+
+#include "lib/lib.h"
+#include "charset/unicode.h"
+#include "charset/U-ligatures.h"
+
+const word *
+Uexpand_lig(uns x)
+{
+  return _U_lig_hash[x % LIG_HASH_SIZE];
+}