From 5bdb091643bbc6e76b75fde68088785f88e9f224 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Wed, 23 Aug 2006 16:38:26 +0200 Subject: [PATCH] optimized integer square root --- images/Makefile | 2 +- images/math.c | 64 ++++++++++++++++++++++++++++ images/math.h | 103 ++++++++++++++++++++++++++++++++++++++++++++++ images/sig-cmp.c | 9 ++-- images/sig-init.c | 7 ++-- 5 files changed, 177 insertions(+), 8 deletions(-) create mode 100644 images/math.c create mode 100644 images/math.h diff --git a/images/Makefile b/images/Makefile index efd5aec1..47e267d1 100644 --- a/images/Makefile +++ b/images/Makefile @@ -6,7 +6,7 @@ PROGS+=$(addprefix $(o)/images/,image-tool image-dup-test image-sim-test) CONFIGS+=images -LIBIMAGES_MODS=config image scale color alpha io-main dup-init dup-cmp sig-dump sig-init sig-cmp object +LIBIMAGES_MODS=math config image scale color alpha io-main dup-init dup-cmp sig-dump sig-init sig-cmp object LIBIMAGES_LIBS=-lm diff --git a/images/math.c b/images/math.c new file mode 100644 index 00000000..f88daa60 --- /dev/null +++ b/images/math.c @@ -0,0 +1,64 @@ +/* + * Image Library -- Math routines + * + * (c) 2006 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU General Public License. + */ + +#include "lib/lib.h" +#include "images/math.h" + +const u32 fast_div_tab[] = { + 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757, + 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154, + 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709, + 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333, + 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367, + 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283, + 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315, + 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085, + 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498, + 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675, + 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441, + 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183, + 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712, + 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400, + 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163, + 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641, + 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573, + 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737, + 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493, + 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373, + 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368, + 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671, + 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767, + 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740, + 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751, + 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635, + 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593, + 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944, + 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933, + 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575, + 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532, + 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010 }; + +const byte fast_sqrt_tab[] = { + 0, 16, 23, 28, 32, 36, 39, 43, 46, 48, 51, 53, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 75, 77, 79, 80, 82, 83, 85, 86, 88, 89, + 91, 92, 94, 95, 96, 98, 99, 100, 101, 103, 104, 105, 106, 108, 109, 110, + 111, 112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 143, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, 156, + 157, 158, 159, 159, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, 169, + 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, 181, + 181, 182, 183, 183, 184, 185, 186, 186, 187, 188, 188, 189, 190, 190, 191, 192, + 192, 193, 194, 194, 195, 196, 196, 197, 198, 198, 199, 199, 200, 201, 201, 202, + 203, 203, 204, 205, 205, 206, 206, 207, 208, 208, 209, 210, 210, 211, 211, 212, + 213, 213, 214, 214, 215, 216, 216, 217, 217, 218, 219, 219, 220, 220, 221, 221, + 222, 223, 223, 224, 224, 225, 225, 226, 227, 227, 228, 228, 229, 229, 230, 230, + 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 237, 237, 238, 238, 239, 239, + 240, 240, 241, 241, 242, 242, 243, 243, 244, 245, 245, 246, 246, 247, 247, 248, + 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255, 255 }; + diff --git a/images/math.h b/images/math.h new file mode 100644 index 00000000..1824c95b --- /dev/null +++ b/images/math.h @@ -0,0 +1,103 @@ +#ifndef _IMAGES_MATH_H +#define _IMAGES_MATH_H + +extern const u32 fast_div_tab[]; +extern const byte fast_sqrt_tab[]; + +static inline uns +isqr(int x) +{ + return x * x; +} + +static inline uns +fast_div_u32_u8(uns x, uns y) +{ +#ifdef CPU_I386 + int ret, dmy; + asm volatile ( + "mull %3" + :"=d"(ret),"=a"(dmy) + :"1"(x),"g"(fast_div_tab[y]) + ); + return ret; +#else + return ((u64)(x) * fast_div_tab[y]) >> 32; +#endif +} + +static inline uns +fast_sqrt_u16(uns x) +{ + uns y; + if (x < (1 << 10) - 3) + y = fast_sqrt_tab[(x + 3) >> 2] >> 3; + else if (x < (1 << 14) - 28) + y = fast_sqrt_tab[(x + 28) >> 6] >> 1; + else + y = fast_sqrt_tab[x >> 8]; + return (x < y * y) ? y - 1 : y; +} + +static inline uns +fast_sqrt_u32(uns x) +{ + uns y; + if (x < (1 << 16)) + { + if (x < (1 << 10) - 3) + y = fast_sqrt_tab[(x + 3) >> 2] >> 3; + else if (x < (1 << 14) - 28) + y = fast_sqrt_tab[(x + 28) >> 6] >> 1; + else + y = fast_sqrt_tab[x >> 8]; + } + else + { + if (x < (1 << 24)) + { + if (x < (1 << 20)) + { + y = fast_sqrt_tab[x >> 12]; + y = (fast_div_u32_u8(x, y) >> 3) + (y << 1); + } + else + { + y = fast_sqrt_tab[x >> 16]; + y = (fast_div_u32_u8(x, y) >> 5) + (y << 3); + } + } + else + { + if (x < (1 << 28)) + { + if (x < (1 << 26)) + { + y = fast_sqrt_tab[x >> 18]; + y = (fast_div_u32_u8(x, y) >> 6) + (y << 4); + } + else + { + y = fast_sqrt_tab[x >> 20]; + y = (fast_div_u32_u8(x, y) >> 7) + (y << 5); + } + } + else + { + if (x < (1 << 30)) + { + y = fast_sqrt_tab[x >> 22]; + y = (fast_div_u32_u8(x, y) >> 8) + (y << 6); + } + else + { + y = fast_sqrt_tab[x >> 24]; + y = (fast_div_u32_u8(x, y) >> 9) + (y << 7); + } + } + } + } + return (x < y * y) ? y - 1 : y; +} + +#endif diff --git a/images/sig-cmp.c b/images/sig-cmp.c index 6367643b..ec3f1e86 100644 --- a/images/sig-cmp.c +++ b/images/sig-cmp.c @@ -11,6 +11,7 @@ #include "lib/lib.h" #include "lib/math.h" +#include "images/math.h" #include "images/images.h" #include "images/signature.h" #include @@ -57,8 +58,8 @@ image_signatures_dist(struct image_signature *sig1, struct image_signature *sig2 f = MIN(f, mf[i][j]); h = MIN(h, mh[i][j]); } - lf[i] = (df * 0x10000) / (df + (int)sqrt(f)); - lh[i] = (dh * 0x10000) / (dh + (int)sqrt(h)); + lf[i] = (df * 0x10000) / (df + fast_sqrt_u32(f)); + lh[i] = (dh * 0x10000) / (dh + fast_sqrt_u32(h)); lfs += lf[i] * (6 * reg1[i].wa + 2 * reg1[i].wb); lhs += lh[i] * reg1[i].wa; } @@ -70,8 +71,8 @@ image_signatures_dist(struct image_signature *sig1, struct image_signature *sig2 f = MIN(f, mf[j][i]); h = MIN(h, mh[j][i]); } - lf[i + cnt1] = (df * 0x10000) / (df + (int)sqrt(f)); - lh[i + cnt1] = (dh * 0x10000) / (dh + (int)sqrt(h)); + lf[i + cnt1] = (df * 0x10000) / (df + fast_sqrt_u32(f)); + lh[i + cnt1] = (dh * 0x10000) / (dh + fast_sqrt_u32(h)); lfs += lf[i] * (6 * reg2[i].wa + 2 * reg2[i].wb); lhs += lh[i] * reg2[i].wa; } diff --git a/images/sig-init.c b/images/sig-init.c index 585210af..41b47723 100644 --- a/images/sig-init.c +++ b/images/sig-init.c @@ -12,6 +12,7 @@ #include "sherlock/sherlock.h" #include "lib/math.h" #include "lib/fastbuf.h" +#include "images/math.h" #include "images/images.h" #include "images/color.h" #include "images/signature.h" @@ -283,9 +284,9 @@ compute_image_signature(struct image_thread *thread UNUSED, struct image_signatu } /* Extract energies in LH, HL and HH bands */ - block->lh = CLAMP((int)(sqrt(t[8] * t[8] + t[9] * t[9] + t[12] * t[12] + t[13] * t[13]) / 16), 0, 255); - block->hl = CLAMP((int)(sqrt(t[2] * t[2] + t[3] * t[3] + t[6] * t[6] + t[7] * t[7]) / 16), 0, 255); - block->hh = CLAMP((int)(sqrt(t[10] * t[10] + t[11] * t[11] + t[14] * t[14] + t[15] * t[15]) / 16), 0, 255); + block->lh = fast_sqrt_u16(isqr(t[8]) + isqr(t[9]) + isqr(t[12]) + isqr(t[13])); + block->hl = fast_sqrt_u16(isqr(t[2]) + isqr(t[3]) + isqr(t[6]) + isqr(t[7])); + block->hh = fast_sqrt_u16(isqr(t[10]) + isqr(t[11]) + isqr(t[14]) + isqr(t[15])); } } -- 2.39.2