+ byte *src_pos = src->pixels + (y_pos >> 16) * src->row_size;
+ y_pos += y_inc;
+ memcpy(dest_pos, src_pos, dest->row_pixels_size);
+ dest_pos += dest->row_size;
+ }
+}
+
+/* Bilinear filter */
+
+UNUSED static void
+image_scale_linear_y(struct image *dest, struct image *src)
+{
+ byte *dest_row = dest->pixels;
+ /* Handle problematic special case */
+ if (src->rows == 1)
+ {
+ for (uns y_counter = dest->rows; y_counter--; dest_row += dest->row_size)
+ memcpy(dest_row, src->pixels, src->row_pixels_size);
+ return;
+ }
+ /* Initialize the main loop */
+ uns y_inc = ((src->rows - 1) << 16) / (dest->rows - 1), y_pos = 0;
+#ifdef __SSE2__
+ __m128i zero = _mm_setzero_si128();
+#endif
+ /* Main loop */
+ for (uns y_counter = dest->rows; --y_counter; )
+ {
+ uns coef = y_pos & 0xffff;
+ byte *src_row_1 = src->pixels + (y_pos >> 16) * src->row_size;
+ byte *src_row_2 = src_row_1 + src->row_size;
+ uns i = 0;
+#ifdef __SSE2__
+ /* SSE2 */
+ __m128i sse_coef = _mm_set1_epi16(coef >> 9);
+ for (; (int)i < (int)dest->row_pixels_size - 15; i += 16)
+ {
+ __m128i a2 = _mm_loadu_si128((__m128i *)(src_row_1 + i));
+ __m128i a1 = _mm_unpacklo_epi8(a2, zero);
+ a2 = _mm_unpackhi_epi8(a2, zero);
+ __m128i b2 = _mm_loadu_si128((__m128i *)(src_row_2 + i));
+ __m128i b1 = _mm_unpacklo_epi8(b2, zero);
+ b2 = _mm_unpackhi_epi8(b2, zero);
+ b1 = _mm_sub_epi16(b1, a1);
+ b2 = _mm_sub_epi16(b2, a2);
+ a1 = _mm_slli_epi16(a1, 7);
+ a2 = _mm_slli_epi16(a2, 7);
+ b1 = _mm_mullo_epi16(b1, sse_coef);
+ b2 = _mm_mullo_epi16(b2, sse_coef);
+ a1 = _mm_add_epi16(a1, b1);
+ a2 = _mm_add_epi16(a2, b2);
+ a1 = _mm_srli_epi16(a1, 7);
+ a2 = _mm_srli_epi16(a2, 7);
+ a1 = _mm_packus_epi16(a1, a2);
+ _mm_storeu_si128((__m128i *)(dest_row + i), a1);
+ }
+#endif
+ /* Unrolled loop using general-purpose registers */
+ for (; (int)i < (int)dest->row_pixels_size - 3; i += 4)
+ {
+ dest_row[i + 0] = LINEAR_INTERPOLATE(src_row_1[i + 0], src_row_2[i + 0], coef);
+ dest_row[i + 1] = LINEAR_INTERPOLATE(src_row_1[i + 1], src_row_2[i + 1], coef);
+ dest_row[i + 2] = LINEAR_INTERPOLATE(src_row_1[i + 2], src_row_2[i + 2], coef);
+ dest_row[i + 3] = LINEAR_INTERPOLATE(src_row_1[i + 3], src_row_2[i + 3], coef);
+ }
+ /* Remaining columns */
+ for (; i < dest->row_pixels_size; i++)
+ dest_row[i] = LINEAR_INTERPOLATE(src_row_1[i], src_row_2[i], coef);
+ dest_row += dest->row_size;
+ y_pos += y_inc;