- block->l = (l_sum >> 4);
- block->u = (u_sum >> 4);
- block->v = (v_sum >> 4);
-
- /* Apply Daubechies wavelet transformation
- * FIXME:
- * - MMX/SSE instructions or tables could be faster
- * - maybe it would be better to compute Luv and wavelet separately because of processor cache or MMX/SSE
- * - eliminate slow square roots
- * - what about Haar transformation? */
-
-#define DAUB_0 31651 /* (1 + sqrt 3) / (4 * sqrt 2) */
-#define DAUB_1 54822 /* (3 + sqrt 3) / (4 * sqrt 2) */
-#define DAUB_2 14689 /* (3 - sqrt 3) / (4 * sqrt 2) */
-#define DAUB_3 -8481 /* (1 - sqrt 3) / (4 * sqrt 2) */
-
- /* ... to the rows */
- uns i;
- for (i = 0; i < 16; i += 4)
- {
- s[i + 0] = (DAUB_0 * t[i + 2] + DAUB_1 * t[i + 3] + DAUB_2 * t[i + 0] + DAUB_3 * t[i + 1]) / 0x10000;
- s[i + 1] = (DAUB_0 * t[i + 0] + DAUB_1 * t[i + 1] + DAUB_2 * t[i + 2] + DAUB_3 * t[i + 3]) / 0x10000;
- s[i + 2] = (DAUB_3 * t[i + 2] - DAUB_2 * t[i + 3] + DAUB_1 * t[i + 0] - DAUB_0 * t[i + 1]) / 0x10000;
- s[i + 3] = (DAUB_3 * t[i + 0] - DAUB_2 * t[i + 1] + DAUB_1 * t[i + 2] - DAUB_0 * t[i + 3]) / 0x10000;
- }
-
- /* ... and to the columns... skip LL band */
- for (i = 0; i < 2; i++)
- {
- t[i + 8] = (DAUB_3 * s[i + 8] - DAUB_2 * s[i +12] + DAUB_1 * s[i + 0] - DAUB_0 * s[i + 4]) / 0x1000;
- t[i +12] = (DAUB_3 * s[i + 0] - DAUB_2 * s[i + 4] + DAUB_1 * s[i + 8] - DAUB_0 * s[i +12]) / 0x1000;
- }
- for (; i < 4; i++)
- {
- t[i + 0] = (DAUB_0 * s[i + 8] + DAUB_1 * s[i +12] + DAUB_2 * s[i + 0] + DAUB_3 * s[i + 4]) / 0x1000;
- t[i + 4] = (DAUB_0 * s[i + 0] + DAUB_1 * s[i + 4] + DAUB_2 * s[i + 8] + DAUB_3 * s[i +12]) / 0x1000;
- t[i + 8] = (DAUB_3 * s[i + 8] - DAUB_2 * s[i +12] + DAUB_1 * s[i + 0] - DAUB_0 * s[i + 4]) / 0x1000;
- t[i +12] = (DAUB_3 * s[i + 0] - DAUB_2 * s[i + 4] + DAUB_1 * s[i + 8] - DAUB_0 * s[i +12]) / 0x1000;
- }
+ /* Extract energies in LH, HL and HH bands */
+ block->v[3] = fast_sqrt_u32(isqr(t[8]) + isqr(t[9]) + isqr(t[12]) + isqr(t[13]));
+ block->v[4] = fast_sqrt_u32(isqr(t[2]) + isqr(t[3]) + isqr(t[6]) + isqr(t[7]));
+ block->v[5] = fast_sqrt_u32(isqr(t[10]) + isqr(t[11]) + isqr(t[14]) + isqr(t[15]));
+ sum[3] += block->v[3] * block->area;
+ sum[4] += block->v[4] * block->area;
+ sum[5] += block->v[5] * block->area;
+ }
+ }