2 * Image Library -- Image scaling algorithms
4 * (c) 2006 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
/* When IMAGE_SCALE_CHANNELS is not already defined, default it to IMAGE_SCALE_PIXEL_SIZE. */
10 #ifndef IMAGE_SCALE_CHANNELS
11 # define IMAGE_SCALE_CHANNELS IMAGE_SCALE_PIXEL_SIZE
/*
 * IMAGE_COPY_PIXEL(dest, src) -- copy one pixel of IMAGE_SCALE_PIXEL_SIZE
 * bytes from the address src to the address dest.
 *
 * Both parameters are parenthesized before the cast is applied, so callers
 * may pass pointer expressions such as (row + size - IMAGE_SCALE_PIXEL_SIZE).
 * Without the parentheses the cast and the dereference bind only to the
 * first operand and the rest of the arithmetic is applied to the loaded
 * pixel value instead of to the address.
 *
 * Pixels of 1, 2 and 4 bytes are moved with a single load and store of the
 * matching integer type; 3-byte pixels are copied byte by byte, which
 * evaluates each argument three times, so arguments must be free of side
 * effects.
 *
 * NOTE(review): the 2- and 4-byte variants access memory through u16 and
 * u32 pointers; presumably pixel data is suitably aligned for that --
 * confirm against the callers.
 *
 * For any other IMAGE_SCALE_PIXEL_SIZE the macro is left undefined.
 */
#undef IMAGE_COPY_PIXEL
#if IMAGE_SCALE_PIXEL_SIZE == 1
#define IMAGE_COPY_PIXEL(dest, src) do{ *(byte *)(dest) = *(byte *)(src); }while(0)
#elif IMAGE_SCALE_PIXEL_SIZE == 2
#define IMAGE_COPY_PIXEL(dest, src) do{ *(u16 *)(dest) = *(u16 *)(src); }while(0)
#elif IMAGE_SCALE_PIXEL_SIZE == 3
#define IMAGE_COPY_PIXEL(dest, src) do{ ((byte *)(dest))[0] = ((byte *)(src))[0]; ((byte *)(dest))[1] = ((byte *)(src))[1]; ((byte *)(dest))[2] = ((byte *)(src))[2]; }while(0)
#elif IMAGE_SCALE_PIXEL_SIZE == 4
#define IMAGE_COPY_PIXEL(dest, src) do{ *(u32 *)(dest) = *(u32 *)(src); }while(0)
#endif
/*
 * Nearest-neighbour scaling of src into dest.
 * Source coordinates are kept in 16.16 fixed point; the integer part
 * (value >> 16) selects the source row or column.
 * NOTE(review): src->cols << 16 and src->rows << 16 wrap once a dimension
 * reaches 65536, assuming uns is 32 bits wide -- confirm that callers bound
 * the image size.
 */
26 IMAGE_SCALE_PREFIX(nearest_xy)(struct image *dest, struct image *src)
/* Source step per destination pixel, horizontally and vertically. */
28 uns x_inc = (src->cols << 16) / dest->cols;
29 uns y_inc = (src->rows << 16) / dest->rows;
/* Sampling starts half a step into the source in both directions. */
30 uns x_start = x_inc >> 1, x_pos;
31 uns y_pos = y_inc >> 1;
/*
 * Parameters for images/image-walk.h. Presumably the header expands them
 * into a loop over the pixels of dest and provides walk_pos -- confirm
 * against that header.
 */
33 # define IMAGE_WALK_PREFIX(x) walk_##x
34 # define IMAGE_WALK_INLINE
35 # define IMAGE_WALK_UNROLL 4
36 # define IMAGE_WALK_IMAGE dest
37 # define IMAGE_WALK_COL_STEP IMAGE_SCALE_PIXEL_SIZE
/* Row hook: locate the source row for y_pos, advance y_pos, rewind x_pos to x_start. */
38 # define IMAGE_WALK_DO_ROW_START do{ row_start = src->pixels + (y_pos >> 16) * src->row_size; y_pos += y_inc; x_pos = x_start; }while(0)
/* Step hook: copy the source pixel at column x_pos >> 16 to walk_pos, then advance x_pos. */
39 # define IMAGE_WALK_DO_STEP do{ byte *pos = row_start + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE; x_pos += x_inc; IMAGE_COPY_PIXEL(walk_pos, pos); }while(0)
40 # include "images/image-walk.h"
/*
 * Disabled by the #if 0 below: experimental horizontal linear scaling that
 * gathers source pixel pairs and interpolation fractions into small
 * buffers before interpolating.
 */
43 #if 0 /* Experiments with rearranging pixels for SSE... */
45 IMAGE_SCALE_PREFIX(linear_x)(struct image *dest, struct image *src)
47 /* Handle problematic special case */
48 byte *src_row = src->pixels;
49 byte *dest_row = dest->pixels;
/* Row loop of the special case: both row pointers advance one row per iteration. */
52 for (uns y_counter = dest->rows; y_counter--; )
56 src_row += src->row_size;
57 dest_row += dest->row_size;
61 /* Initialize the main loop */
/*
 * 16.16 fixed-point source step chosen so that the first and last
 * destination columns map onto the first and last source columns.
 * The divisor is dest->cols - 1.
 */
62 uns x_inc = ((src->cols - 1) << 16) / (dest->cols - 1);
63 # define COLS_AT_ONCE 256
/* pixel_buf holds two source pixels per buffered column. */
64 byte pixel_buf[COLS_AT_ONCE * 2 * IMAGE_SCALE_PIXEL_SIZE]; /* Buffers should fit in cache */
65 u16 coef_buf[COLS_AT_ONCE * IMAGE_SCALE_PIXEL_SIZE];
67 for (uns y_counter = dest->rows; y_counter--; )
70 byte *dest_pos = dest_row;
71 for (uns x_counter = dest->cols; --x_counter; )
72 for (uns x_counter = dest->cols; x_counter > COLS_AT_ONCE; x_counter -= COLS_AT_ONCE)
74 byte *pixel_buf_pos = pixel_buf;
75 u16 *coef_buf_pos = coef_buf;
/* Gather phase: each iteration handles two destination columns. */
76 for (uns i = 0; i < COLS_AT_ONCE / 2; i++)
/* Left source pixel and 16-bit fraction for the first column of the pair. */
78 byte *src_pos = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
79 uns ofs = x_pos & 0xffff;
/* The same for the second column of the pair. */
81 byte *src_pos_2 = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
82 uns ofs_2 = x_pos & 0xffff;
84 *coef_buf_pos++ = ofs;
85 byte *pixel_buf_pos_2 = pixel_buf_pos + IMAGE_SCALE_PIXEL_SIZE;
86 byte *pixel_buf_pos_3 = pixel_buf_pos + IMAGE_SCALE_PIXEL_SIZE * 2;
87 byte *pixel_buf_pos_4 = pixel_buf_pos + IMAGE_SCALE_PIXEL_SIZE * 3;
/*
 * Store left and right neighbour of both columns: four pixels per iteration.
 * NOTE(review): two of the source arguments are arithmetic expressions;
 * correct expansion relies on IMAGE_COPY_PIXEL parenthesizing its parameters.
 */
88 IMAGE_COPY_PIXEL(pixel_buf_pos, src_pos);
89 IMAGE_COPY_PIXEL(pixel_buf_pos_2, src_pos + IMAGE_SCALE_PIXEL_SIZE);
90 IMAGE_COPY_PIXEL(pixel_buf_pos_3, src_pos_2);
91 IMAGE_COPY_PIXEL(pixel_buf_pos_4, src_pos_2 + IMAGE_SCALE_PIXEL_SIZE);
92 pixel_buf_pos += 4 * IMAGE_SCALE_PIXEL_SIZE;
93 *coef_buf_pos++ = ofs_2;
/* Direct per-pixel variant: interpolate each channel between a source pixel and its right neighbour. */
96 byte *src_pos = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
97 uns ofs = x_pos & 0xffff;
99 dest_pos[0] = LINEAR_INTERPOLATE(src_pos[0], src_pos[0 + IMAGE_SCALE_PIXEL_SIZE], ofs);
100 # if IMAGE_SCALE_CHANNELS >= 2
101 dest_pos[1] = LINEAR_INTERPOLATE(src_pos[1], src_pos[1 + IMAGE_SCALE_PIXEL_SIZE], ofs);
103 # if IMAGE_SCALE_CHANNELS >= 3
104 dest_pos[2] = LINEAR_INTERPOLATE(src_pos[2], src_pos[2 + IMAGE_SCALE_PIXEL_SIZE], ofs);
106 # if IMAGE_SCALE_CHANNELS >= 4
107 dest_pos[3] = LINEAR_INTERPOLATE(src_pos[3], src_pos[3 + IMAGE_SCALE_PIXEL_SIZE], ofs);
109 dest_pos += IMAGE_SCALE_PIXEL_SIZE;*/
112 /* Always copy the last column - handle "x_pos == dest->cols * 0x10000" overflow */
/* NOTE(review): the source argument is an expression; this relies on IMAGE_COPY_PIXEL parenthesizing its parameters. */
113 IMAGE_COPY_PIXEL(dest_pos, src_row + src->row_pixels_size - IMAGE_SCALE_PIXEL_SIZE);
115 src_row += src->row_size;
116 dest_row += dest->row_size;
/*
 * Bilinear scaling of src into dest.
 * Each needed source row is first interpolated horizontally into a cache
 * buffer as wide as a destination row; destination rows are then produced
 * by interpolating vertically between the two cached rows cache[0] and
 * cache[1].
 */
122 IMAGE_SCALE_PREFIX(bilinear_xy)(struct image *dest, struct image *src)
/* 16.16 fixed-point source steps per destination column and row. */
124 uns x_inc = (((src->cols - 1) << 16) - 1) / (dest->cols);
125 uns y_inc = (((src->rows - 1) << 16) - 1) / (dest->rows);
/*
 * Two stack buffers (variable-length arrays) of one destination row plus
 * 16 spare bytes each, so that a 16-byte aligned pointer fits inside.
 * NOTE(review): stack use grows with dest->row_pixels_size.
 */
127 byte *cache[2], buf1[dest->row_pixels_size + 16], buf2[dest->row_pixels_size + 16], *pbuf[2];
128 byte *dest_row = dest->pixels, *dest_pos;
/* cache_index starts at ~0U so that cache_index + 1 wraps to 0. */
129 uns cache_index = ~0U, cache_i = 0;
130 pbuf[0] = cache[0] = ALIGN_PTR((void *)buf1, 16);
131 pbuf[1] = cache[1] = ALIGN_PTR((void *)buf2, 16);
133 __m128i zero = _mm_setzero_si128();
/* One iteration per destination row. */
135 for (uns row_counter = dest->rows; row_counter--; )
/* Integer source row and 16-bit vertical fraction for this destination row. */
138 uns y_index = y_pos >> 16;
139 uns y_ofs = y_pos & 0xffff;
/* If the needed row is more than one past the cached one, jump cache_index forward to y_index - 1. */
142 if (y_index > (uns)(cache_index + 1))
143 cache_index = y_index - 1;
/* Refill the cache while the needed source row is beyond the cached one. */
144 while (y_index > cache_index)
/* Toggle cache_i and make cache[1] point at the buffer selected by the new value. */
147 cache[1] = pbuf[cache_i ^= 1];
149 byte *src_row = src->pixels + cache_index * src->row_size;
150 byte *cache_pos = cache[1];
/* Horizontal pass over all destination columns except the last one. */
151 for (uns col_counter = dest->cols; --col_counter; )
/* c1 and c2 are the two neighbouring source pixels, ofs the 16-bit fraction between them. */
153 byte *c1 = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
154 byte *c2 = c1 + IMAGE_SCALE_PIXEL_SIZE;
155 uns ofs = x_pos & 0xffff;
156 cache_pos[0] = LINEAR_INTERPOLATE(c1[0], c2[0], ofs);
157 # if IMAGE_SCALE_CHANNELS >= 2
158 cache_pos[1] = LINEAR_INTERPOLATE(c1[1], c2[1], ofs);
160 # if IMAGE_SCALE_CHANNELS >= 3
161 cache_pos[2] = LINEAR_INTERPOLATE(c1[2], c2[2], ofs);
163 # if IMAGE_SCALE_CHANNELS >= 4
164 cache_pos[3] = LINEAR_INTERPOLATE(c1[3], c2[3], ofs);
166 cache_pos += IMAGE_SCALE_PIXEL_SIZE;
/*
 * The last cached column is a plain copy of the last source pixel of the row.
 * NOTE(review): the source argument is an expression; this relies on
 * IMAGE_COPY_PIXEL parenthesizing its parameters.
 */
169 IMAGE_COPY_PIXEL(cache_pos, src_row + src->row_pixels_size - IMAGE_SCALE_PIXEL_SIZE);
/*
 * SSE2 vertical pass, 16 bytes per iteration.
 * coef is the top 7 bits of the vertical fraction, broadcast to eight
 * 16-bit lanes. Each byte is computed as (a * 128 + (b - a) * coef) >> 7
 * in 16-bit lanes, where a comes from cache[0] and b from cache[1].
 */
173 __m128i coef = _mm_set1_epi16(y_ofs >> 9);
/* Signed comparison so that rows shorter than 16 bytes skip this loop. */
174 for (; (int)i < (int)dest->row_pixels_size - 15; i += 16)
/* Load 16 bytes of each cached row and widen them to 16-bit lanes. */
176 __m128i a2 = _mm_loadu_si128((__m128i *)(cache[0] + i));
177 __m128i a1 = _mm_unpacklo_epi8(a2, zero);
178 a2 = _mm_unpackhi_epi8(a2, zero);
179 __m128i b2 = _mm_loadu_si128((__m128i *)(cache[1] + i));
180 __m128i b1 = _mm_unpacklo_epi8(b2, zero);
181 b2 = _mm_unpackhi_epi8(b2, zero);
182 b1 = _mm_sub_epi16(b1, a1);
183 b2 = _mm_sub_epi16(b2, a2);
184 a1 = _mm_slli_epi16(a1, 7);
185 a2 = _mm_slli_epi16(a2, 7);
186 b1 = _mm_mullo_epi16(b1, coef);
187 b2 = _mm_mullo_epi16(b2, coef);
188 a1 = _mm_add_epi16(a1, b1);
189 a2 = _mm_add_epi16(a2, b2);
190 a1 = _mm_srli_epi16(a1, 7);
191 a2 = _mm_srli_epi16(a2, 7);
/* Narrow back to bytes with unsigned saturation and store unaligned into the destination row. */
192 a1 = _mm_packus_epi16(a1, a2);
193 _mm_storeu_si128((__m128i *)(dest_pos + i), a1);
/* Scalar vertical pass, four bytes per iteration, using the full 16-bit fraction. */
196 for (; (int)i < (int)dest->row_pixels_size - 3; i += 4)
198 dest_pos[i + 0] = LINEAR_INTERPOLATE(cache[0][i + 0], cache[1][i + 0], y_ofs);
199 dest_pos[i + 1] = LINEAR_INTERPOLATE(cache[0][i + 1], cache[1][i + 1], y_ofs);
200 dest_pos[i + 2] = LINEAR_INTERPOLATE(cache[0][i + 2], cache[1][i + 2], y_ofs);
201 dest_pos[i + 3] = LINEAR_INTERPOLATE(cache[0][i + 3], cache[1][i + 3], y_ofs);
/* Remaining bytes of the row, one at a time. */
204 for (; i < dest->row_pixels_size; i++)
205 dest_pos[i] = LINEAR_INTERPOLATE(cache[0][i], cache[1][i], y_ofs);
206 dest_row += dest->row_size;
/*
 * Downsampling of src into dest by accumulating weighted source pixels.
 * Positions are tracked in 32.32 fixed point, where 0x100000000 is the
 * extent of one destination pixel. Per-channel sums for the destination
 * row under construction are kept in buf and scaled by final_mul when they
 * are written out.
 */
212 IMAGE_SCALE_PREFIX(downsample_xy)(struct image *dest, struct image *src)
215 byte *rsrc = src->pixels, *psrc;
216 byte *rdest = dest->pixels, *pdest;
/* Extent of one source pixel measured in destination pixels, 32.32 fixed point. */
217 u64 x_inc = ((u64)dest->cols << 32) / src->cols, x_pos;
218 u64 y_inc = ((u64)dest->rows << 32) / src->rows, y_pos = 0;
/*
 * Reciprocals of the steps: multiplying (pos >> 16) by one of them yields
 * pos / inc as a fraction scaled to 24 bits.
 */
219 uns x_inc_frac = (u64)0xffffffffff / x_inc;
220 uns y_inc_frac = (u64)0xffffffffff / y_inc;
/* Product of both steps in 16.16 fixed point; output values are (sum * final_mul) >> 16. */
221 uns final_mul = ((u64)(x_inc >> 16) * (y_inc >> 16)) >> 16;
/*
 * One u32 accumulator per channel and destination column, zeroed up front.
 * NOTE(review): buf is a variable-length array, so stack use grows with
 * dest->cols.
 */
222 uns buf_size = dest->cols * IMAGE_SCALE_CHANNELS;
223 u32 buf[buf_size], *pbuf;
224 buf_size *= sizeof(u32);
225 bzero(buf, buf_size);
/* One iteration per source row. */
226 for (uns rows_counter = src->rows; rows_counter--; )
230 rsrc += src->row_size;
/* Branch for y_pos not beyond the extent of one destination row. */
233 if (y_pos <= 0x100000000)
235 for (uns cols_counter = src->cols; cols_counter--; )
/* Branch for x_pos not beyond the extent of one destination column. */
238 if (x_pos <= 0x100000000)
241 # if IMAGE_SCALE_CHANNELS >= 2
244 # if IMAGE_SCALE_CHANNELS >= 3
247 # if IMAGE_SCALE_CHANNELS >= 4
/*
 * Horizontal boundary handling: keep the overshoot in x_pos and split the
 * source pixel between the current accumulator (weight mul1) and the next
 * one (weight mul2). Both weights are 24-bit fractions.
 */
253 x_pos -= 0x100000000;
254 uns mul2 = (uns)(x_pos >> 16) * x_inc_frac;
255 uns mul1 = 0xffffff - mul2;
256 pbuf[0] += (psrc[0] * mul1) >> 24;
257 pbuf[0 + IMAGE_SCALE_CHANNELS] += (psrc[0] * mul2) >> 24;
258 # if IMAGE_SCALE_CHANNELS >= 2
259 pbuf[1] += (psrc[1] * mul1) >> 24;
260 pbuf[1 + IMAGE_SCALE_CHANNELS] += (psrc[1] * mul2) >> 24;
262 # if IMAGE_SCALE_CHANNELS >= 3
263 pbuf[2] += (psrc[2] * mul1) >> 24;
264 pbuf[2 + IMAGE_SCALE_CHANNELS] += (psrc[2] * mul2) >> 24;
266 # if IMAGE_SCALE_CHANNELS >= 4
267 pbuf[3] += (psrc[3] * mul1) >> 24;
268 pbuf[3 + IMAGE_SCALE_CHANNELS] += (psrc[3] * mul2) >> 24;
270 pbuf += IMAGE_SCALE_CHANNELS;
272 psrc += IMAGE_SCALE_PIXEL_SIZE;
/*
 * Vertical boundary handling: keep the overshoot in y_pos, advance the
 * destination row pointer and derive the vertical weights. mul1 is used
 * for sums written to the accumulators in pbuf, mul2 for the carries
 * a0..a3 that are stored back into pbuf after a destination pixel is
 * written.
 */
277 y_pos -= 0x100000000;
279 rdest += dest->row_size;
280 uns mul2 = (uns)(y_pos >> 16) * y_inc_frac;
281 uns mul1 = 0xffffff - mul2;
283 # if IMAGE_SCALE_CHANNELS >= 2
286 # if IMAGE_SCALE_CHANNELS >= 3
289 # if IMAGE_SCALE_CHANNELS >= 4
292 for (uns cols_counter = src->cols; cols_counter--; )
/* Branch for x_pos not beyond the extent of one destination column: split vertically only. */
295 if (x_pos <= 0x100000000)
297 pbuf[0] += ((psrc[0] * mul1) >> 24);
298 a0 += (psrc[0] * mul2) >> 24;
299 # if IMAGE_SCALE_CHANNELS >= 2
300 pbuf[1] += ((psrc[1] * mul1) >> 24);
301 a1 += (psrc[1] * mul2) >> 24;
303 # if IMAGE_SCALE_CHANNELS >= 3
304 pbuf[2] += ((psrc[2] * mul1) >> 24);
305 a2 += (psrc[2] * mul2) >> 24;
307 # if IMAGE_SCALE_CHANNELS >= 4
308 pbuf[3] += ((psrc[3] * mul1) >> 24);
309 a3 += (psrc[3] * mul2) >> 24;
/*
 * Horizontal boundary inside a vertical boundary: four combined weights,
 * each the product of one vertical and one horizontal 24-bit weight.
 */
314 x_pos -= 0x100000000;
315 uns mul4 = (uns)(x_pos >> 16) * x_inc_frac;
316 uns mul3 = 0xffffff - mul4;
317 uns mul13 = ((u64)mul1 * mul3) >> 24;
318 uns mul23 = ((u64)mul2 * mul3) >> 24;
319 uns mul14 = ((u64)mul1 * mul4) >> 24;
320 uns mul24 = ((u64)mul2 * mul4) >> 24;
/*
 * Per channel: write the finished destination value, restart the
 * accumulator with the carry, and seed the next accumulator and carry.
 * NOTE(review): the mul14 and mul24 terms read the pixel after psrc
 * (offset IMAGE_SCALE_PIXEL_SIZE), whereas the horizontal split earlier in
 * this function distributes psrc itself with both weights -- confirm this
 * is intended, including on the last source column.
 */
321 pdest[0] = ((((psrc[0] * mul13) >> 24) + pbuf[0]) * final_mul) >> 16;
322 pbuf[0] = ((psrc[0] * mul23) >> 24) + a0;
323 pbuf[0 + IMAGE_SCALE_CHANNELS] += ((psrc[0 + IMAGE_SCALE_PIXEL_SIZE] * mul14) >> 24);
324 a0 = ((psrc[0 + IMAGE_SCALE_PIXEL_SIZE] * mul24) >> 24);
325 # if IMAGE_SCALE_CHANNELS >= 2
326 pdest[1] = ((((psrc[1] * mul13) >> 24) + pbuf[1]) * final_mul) >> 16;
327 pbuf[1] = ((psrc[1] * mul23) >> 24) + a1;
328 pbuf[1 + IMAGE_SCALE_CHANNELS] += ((psrc[1 + IMAGE_SCALE_PIXEL_SIZE] * mul14) >> 24);
329 a1 = ((psrc[1 + IMAGE_SCALE_PIXEL_SIZE] * mul24) >> 24);
331 # if IMAGE_SCALE_CHANNELS >= 3
332 pdest[2] = ((((psrc[2] * mul13) >> 24) + pbuf[2]) * final_mul) >> 16;
333 pbuf[2] = ((psrc[2] * mul23) >> 24) + a2;
334 pbuf[2 + IMAGE_SCALE_CHANNELS] += ((psrc[2 + IMAGE_SCALE_PIXEL_SIZE] * mul14) >> 24);
335 a2 = ((psrc[2 + IMAGE_SCALE_PIXEL_SIZE] * mul24) >> 24);
337 # if IMAGE_SCALE_CHANNELS >= 4
338 pdest[3] = ((((psrc[3] * mul13) >> 24) + pbuf[3]) * final_mul) >> 16;
339 pbuf[3] = ((psrc[3] * mul23) >> 24) + a3;
340 pbuf[3 + IMAGE_SCALE_CHANNELS] += ((psrc[3 + IMAGE_SCALE_PIXEL_SIZE] * mul14) >> 24);
341 a3 = ((psrc[3 + IMAGE_SCALE_PIXEL_SIZE] * mul24) >> 24);
343 pbuf += IMAGE_SCALE_CHANNELS;
344 pdest += IMAGE_SCALE_PIXEL_SIZE;
346 psrc += IMAGE_SCALE_PIXEL_SIZE;
/* After the column loop: write out the accumulator that pbuf currently points at. */
348 pdest[0] = (pbuf[0] * final_mul) >> 16;
350 # if IMAGE_SCALE_CHANNELS >= 2
351 pdest[1] = (pbuf[1] * final_mul) >> 16;
354 # if IMAGE_SCALE_CHANNELS >= 3
355 pdest[2] = (pbuf[2] * final_mul) >> 16;
358 # if IMAGE_SCALE_CHANNELS >= 4
359 pdest[3] = (pbuf[3] * final_mul) >> 16;
/* Final flush: scale every accumulator into one destination row, one pixel per column of dest. */
366 for (uns cols_counter = dest->cols; cols_counter--; )
368 pdest[0] = (pbuf[0] * final_mul) >> 16;
369 # if IMAGE_SCALE_CHANNELS >= 2
370 pdest[1] = (pbuf[1] * final_mul) >> 16;
372 # if IMAGE_SCALE_CHANNELS >= 3
373 pdest[2] = (pbuf[2] * final_mul) >> 16;
375 # if IMAGE_SCALE_CHANNELS >= 4
376 pdest[3] = (pbuf[3] * final_mul) >> 16;
378 pbuf += IMAGE_SCALE_CHANNELS;
379 pdest += IMAGE_SCALE_PIXEL_SIZE;
/*
 * Drop the parameter macros of this section; presumably so that the file
 * can be included again with a different configuration -- confirm against
 * the including sources.
 */
383 #undef IMAGE_SCALE_PREFIX
384 #undef IMAGE_SCALE_PIXEL_SIZE
385 #undef IMAGE_SCALE_CHANNELS