From 94742eb40ed0dc1608f8e163fa3f9933f2c2bd18 Mon Sep 17 00:00:00 2001
From: Pavel Charvat <pavel.charvat@netcentrum.cz>
Date: Mon, 2 Oct 2006 10:08:03 +0200
Subject: [PATCH] image_scale: - fixed visible rounding error near right and
 bottom edges - implemented simple nearest neighbour algorithm - speed
 experiments with SIMD bilinear filter...   will continue in free time

---
 images/image.c     |  12 ++-
 images/images.h    |   1 +
 images/scale-gen.h | 223 ++++++++++++++++++++++++++++++++++++++++++---
 images/scale.c     | 215 ++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 413 insertions(+), 38 deletions(-)

diff --git a/images/image.c b/images/image.c
index 48d5d6b9..2bdcd189 100644
--- a/images/image.c
+++ b/images/image.c
@@ -48,7 +48,7 @@ image_new(struct image_context *ctx, uns cols, uns rows, uns flags, struct mempo
       return NULL;
     }
   struct image *img;
-  uns pixel_size, row_size, align;
+  uns pixel_size, row_pixels_size, row_size, align;
   pixel_size = flags_to_pixel_size(flags);
   switch (pixel_size)
     {
@@ -70,8 +70,8 @@ image_new(struct image_context *ctx, uns cols, uns rows, uns flags, struct mempo
     align = pixel_size;
   else
     align = 1;
-  row_size = cols * pixel_size;
-  row_size = ALIGN(row_size, align);
+  row_pixels_size = cols * pixel_size;
+  row_size = ALIGN(row_pixels_size, align);
   u64 image_size_64 = (u64)row_size * rows;
   u64 bytes_64 = image_size_64 + (sizeof(struct image) + IMAGE_SSE_ALIGN_SIZE - 1 + sizeof(uns));
   if (unlikely(bytes_64 > image_max_bytes))
@@ -94,6 +94,7 @@ image_new(struct image_context *ctx, uns cols, uns rows, uns flags, struct mempo
   img->cols = cols;
   img->rows = rows;
   img->row_size = row_size;
+  img->row_pixels_size = row_pixels_size;
   img->image_size = image_size_64;
   DBG("img=%p flags=0x%x pixel_size=%u row_size=%u image_size=%u pixels=%p",
     img, img->flags, img->pixel_size, img->row_size, img->image_size, img->pixels);
@@ -125,10 +126,9 @@ image_clone(struct image_context *ctx, struct image *src, uns flags, struct memp
         {
           byte *s = src->pixels;
           byte *d = img->pixels;
-          uns bytes = src->cols * img->pixel_size;
 	  for (uns row = src->rows; row--; )
             {
-	      memcpy(d, s, bytes);
+	      memcpy(d, s, src->row_pixels_size);
 	      d += img->row_size;
 	      s += src->row_size;
 	    }
@@ -177,6 +177,7 @@ image_init_matrix(struct image_context *ctx, struct image *img, byte *pixels, un
   img->rows = rows;
   img->pixel_size = flags_to_pixel_size(flags);
   img->row_size = row_size;
+  img->row_pixels_size = cols * img->pixel_size;
   img->image_size = rows * row_size;
   img->flags = flags & (IMAGE_NEW_FLAGS | IMAGE_GAPS_PROTECTED);
   return img;
@@ -192,6 +193,7 @@ image_init_subimage(struct image_context *ctx UNUSED, struct image *img, struct
   img->rows = rows;
   img->pixel_size = src->pixel_size;
   img->row_size = src->row_size;
+  img->row_pixels_size = cols * src->pixel_size;
   img->image_size = src->row_size * rows;
   img->flags = src->flags & IMAGE_NEW_FLAGS;
   img->flags |= IMAGE_GAPS_PROTECTED;
diff --git a/images/images.h b/images/images.h
index 2b698dc9..46370f65 100644
--- a/images/images.h
+++ b/images/images.h
@@ -75,6 +75,7 @@ struct image {
   u32 rows;			/* number of rows */
   u32 pixel_size;		/* size of pixel (1, 2, 3 or 4) */
   u32 row_size;			/* scanline size in bytes */
+  u32 row_pixels_size;		/* scanline size in bytes excluding rows gaps */
   u32 image_size;		/* rows * row_size */
   u32 flags;			/* enum image_flag */
 };
diff --git a/images/scale-gen.h b/images/scale-gen.h
index ff51a513..fa23225d 100644
--- a/images/scale-gen.h
+++ b/images/scale-gen.h
@@ -11,15 +11,214 @@
 #  define IMAGE_SCALE_CHANNELS IMAGE_SCALE_PIXEL_SIZE
 #endif
 
+#undef IMAGE_COPY_PIXEL
+#if IMAGE_SCALE_PIXEL_SIZE == 1
+#define IMAGE_COPY_PIXEL(dest, src) do{ *(byte *)dest = *(byte *)src; }while(0)
+#elif IMAGE_SCALE_PIXEL_SIZE == 2
+#define IMAGE_COPY_PIXEL(dest, src) do{ *(u16 *)dest = *(u16 *)src; }while(0)
+#elif IMAGE_SCALE_PIXEL_SIZE == 3
+#define IMAGE_COPY_PIXEL(dest, src) do{ ((byte *)dest)[0] = ((byte *)src)[0]; ((byte *)dest)[1] = ((byte *)src)[1]; ((byte *)dest)[2] = ((byte *)src)[2]; }while(0)
+#elif IMAGE_SCALE_PIXEL_SIZE == 4
+#define IMAGE_COPY_PIXEL(dest, src) do{ *(u32 *)dest = *(u32 *)src; }while(0)
+#endif
+
+static void
+IMAGE_SCALE_PREFIX(nearest_xy)(struct image *dest, struct image *src)
+{
+  uns x_inc = (src->cols << 16) / dest->cols;
+  uns y_inc = (src->rows << 16) / dest->rows;
+  uns x_start = x_inc >> 1, x_pos;
+  uns y_pos = y_inc >> 1;
+  byte *row_start;
+# define IMAGE_WALK_PREFIX(x) walk_##x
+# define IMAGE_WALK_INLINE
+# define IMAGE_WALK_UNROLL 4
+# define IMAGE_WALK_IMAGE dest
+# define IMAGE_WALK_COL_STEP IMAGE_SCALE_PIXEL_SIZE
+# define IMAGE_WALK_DO_ROW_START do{ row_start = src->pixels + (y_pos >> 16) * src->row_size; y_pos += y_inc; x_pos = x_start; }while(0)
+# define IMAGE_WALK_DO_STEP do{ byte *pos = row_start + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE; x_pos += x_inc; IMAGE_COPY_PIXEL(walk_pos, pos); }while(0)
+# include "images/image-walk.h"
+}
+
+#if 0 /* Experiments with rearranging pixels for SSE... */
+static void
+IMAGE_SCALE_PREFIX(linear_x)(struct image *dest, struct image *src)
+{
+  /* Handle problematic special case */
+  byte *src_row = src->pixels;
+  byte *dest_row = dest->pixels;
+  if (src->cols == 1)
+    {
+      for (uns y_counter = dest->rows; y_counter--; )
+        {
+          // FIXME
+          ASSERT(0);
+	  src_row += src->row_size;
+	  dest_row += dest->row_size;
+	}
+      return;
+    }
+  /* Initialize the main loop */
+  uns x_inc = ((src->cols - 1) << 16) / (dest->cols - 1);
+# define COLS_AT_ONCE 256
+  byte pixel_buf[COLS_AT_ONCE * 2 * IMAGE_SCALE_PIXEL_SIZE]; /* Buffers should fit in cache */
+  u16 coef_buf[COLS_AT_ONCE * IMAGE_SCALE_PIXEL_SIZE];
+  /* Main loop */
+  for (uns y_counter = dest->rows; y_counter--; )
+    {
+      uns x_pos = 0;
+      byte *dest_pos = dest_row;
+      for (uns x_counter = dest->cols; --x_counter; )
+      for (uns x_counter = dest->cols; x_counter > COLS_AT_ONCE; x_counter -= COLS_AT_ONCE)
+        {
+	  byte *pixel_buf_pos = pixel_buf;
+	  u16 *coef_buf_pos = coef_buf;
+	  for (uns i = 0; i < COLS_AT_ONCE / 2; i++)
+	    {
+	      byte *src_pos = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
+	      uns ofs = x_pos & 0xffff;
+	      x_pos += x_inc;
+	      byte *src_pos_2 = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
+	      uns ofs_2 = x_pos & 0xffff;
+	      x_pos += x_inc;
+	      *coef_buf_pos++ = ofs;
+	      byte *pixel_buf_pos_2 = pixel_buf_pos + IMAGE_SCALE_PIXEL_SIZE;
+	      byte *pixel_buf_pos_3 = pixel_buf_pos + IMAGE_SCALE_PIXEL_SIZE * 2;
+	      byte *pixel_buf_pos_4 = pixel_buf_pos + IMAGE_SCALE_PIXEL_SIZE * 3;
+              IMAGE_COPY_PIXEL(pixel_buf_pos, src_pos);
+	      IMAGE_COPY_PIXEL(pixel_buf_pos_2, src_pos + IMAGE_SCALE_PIXEL_SIZE);
+              IMAGE_COPY_PIXEL(pixel_buf_pos_3, src_pos_2);
+	      IMAGE_COPY_PIXEL(pixel_buf_pos_4, src_pos_2 + IMAGE_SCALE_PIXEL_SIZE);
+	      pixel_buf_pos += 4 * IMAGE_SCALE_PIXEL_SIZE;
+	      *coef_buf_pos++ = ofs_2;
+	    }
+/*
+	  byte *src_pos = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
+	  uns ofs = x_pos & 0xffff;
+	  x_pos += x_inc;
+	  dest_pos[0] = LINEAR_INTERPOLATE(src_pos[0], src_pos[0 + IMAGE_SCALE_PIXEL_SIZE], ofs);
+#	  if IMAGE_SCALE_CHANNELS >= 2
+	  dest_pos[1] = LINEAR_INTERPOLATE(src_pos[1], src_pos[1 + IMAGE_SCALE_PIXEL_SIZE], ofs);
+#	  endif
+#	  if IMAGE_SCALE_CHANNELS >= 3
+	  dest_pos[2] = LINEAR_INTERPOLATE(src_pos[2], src_pos[2 + IMAGE_SCALE_PIXEL_SIZE], ofs);
+#	  endif
+#	  if IMAGE_SCALE_CHANNELS >= 4
+	  dest_pos[3] = LINEAR_INTERPOLATE(src_pos[3], src_pos[3 + IMAGE_SCALE_PIXEL_SIZE], ofs);
+#	  endif
+	  dest_pos += IMAGE_SCALE_PIXEL_SIZE;*/
+
+	}
+      /* Always copy the last column - handle "x_pos == dest->cols * 0x10000" overflow */
+      IMAGE_COPY_PIXEL(dest_pos, src_row + src->row_pixels_size - IMAGE_SCALE_PIXEL_SIZE);
+      /* Next step */
+      src_row += src->row_size;
+      dest_row += dest->row_size;
+    }
+#undef COLS_AT_ONCE
+}
+
+static void
+IMAGE_SCALE_PREFIX(bilinear_xy)(struct image *dest, struct image *src)
+{
+  uns x_inc = (((src->cols - 1) << 16) - 1) / (dest->cols);
+  uns y_inc = (((src->rows - 1) << 16) - 1) / (dest->rows);
+  uns y_pos = 0x10000;
+  byte *cache[2], buf1[dest->row_pixels_size + 16], buf2[dest->row_pixels_size + 16], *pbuf[2];
+  byte *dest_row = dest->pixels, *dest_pos;
+  uns cache_index = ~0U, cache_i = 0;
+  pbuf[0] = cache[0] = ALIGN_PTR((void *)buf1, 16);
+  pbuf[1] = cache[1] = ALIGN_PTR((void *)buf2, 16);
+#ifdef __SSE2__
+  __m128i zero = _mm_setzero_si128();
+#endif
+  for (uns row_counter = dest->rows; row_counter--; )
+    {
+      dest_pos = dest_row;
+      uns y_index = y_pos >> 16;
+      uns y_ofs = y_pos & 0xffff;
+      y_pos += y_inc;
+      uns x_pos = 0;
+      if (y_index > (uns)(cache_index + 1))
+	cache_index = y_index - 1;
+      while (y_index > cache_index)
+        {
+	  cache[0] = cache[1];
+	  cache[1] = pbuf[cache_i ^= 1];
+	  cache_index++;
+	  byte *src_row = src->pixels + cache_index * src->row_size;
+	  byte *cache_pos = cache[1];
+	  for (uns col_counter = dest->cols; --col_counter; )
+	    {
+	      byte *c1 = src_row + (x_pos >> 16) * IMAGE_SCALE_PIXEL_SIZE;
+	      byte *c2 = c1 + IMAGE_SCALE_PIXEL_SIZE;
+	      uns ofs = x_pos & 0xffff;
+	      cache_pos[0] = LINEAR_INTERPOLATE(c1[0], c2[0], ofs);
+#	      if IMAGE_SCALE_CHANNELS >= 2
+	      cache_pos[1] = LINEAR_INTERPOLATE(c1[1], c2[1], ofs);
+#	      endif
+#	      if IMAGE_SCALE_CHANNELS >= 3
+	      cache_pos[2] = LINEAR_INTERPOLATE(c1[2], c2[2], ofs);
+#	      endif
+#	      if IMAGE_SCALE_CHANNELS >= 4
+	      cache_pos[3] = LINEAR_INTERPOLATE(c1[3], c2[3], ofs);
+#	      endif
+	      cache_pos += IMAGE_SCALE_PIXEL_SIZE;
+	      x_pos += x_inc;
+	    }
+	  IMAGE_COPY_PIXEL(cache_pos, src_row + src->row_pixels_size - IMAGE_SCALE_PIXEL_SIZE);
+	}
+      uns i = 0;
+#ifdef __SSE2__
+      __m128i coef = _mm_set1_epi16(y_ofs >> 9);
+      for (; (int)i < (int)dest->row_pixels_size - 15; i += 16)
+        {
+	  __m128i a2 = _mm_loadu_si128((__m128i *)(cache[0] + i));
+	  __m128i a1 = _mm_unpacklo_epi8(a2, zero);
+	  a2 = _mm_unpackhi_epi8(a2, zero);
+	  __m128i b2 = _mm_loadu_si128((__m128i *)(cache[1] + i));
+	  __m128i b1 = _mm_unpacklo_epi8(b2, zero);
+	  b2 = _mm_unpackhi_epi8(b2, zero);
+	  b1 = _mm_sub_epi16(b1, a1);
+	  b2 = _mm_sub_epi16(b2, a2);
+	  a1 = _mm_slli_epi16(a1, 7);
+	  a2 = _mm_slli_epi16(a2, 7);
+	  b1 = _mm_mullo_epi16(b1, coef);
+	  b2 = _mm_mullo_epi16(b2, coef);
+	  a1 = _mm_add_epi16(a1, b1);
+	  a2 = _mm_add_epi16(a2, b2);
+	  a1 = _mm_srli_epi16(a1, 7);
+	  a2 = _mm_srli_epi16(a2, 7);
+	  a1 = _mm_packus_epi16(a1, a2);
+	  _mm_storeu_si128((__m128i *)(dest_pos + i), a1);
+	}
+#elif 1
+      for (; (int)i < (int)dest->row_pixels_size - 3; i += 4)
+        {
+	  dest_pos[i + 0] = LINEAR_INTERPOLATE(cache[0][i + 0], cache[1][i + 0], y_ofs);
+	  dest_pos[i + 1] = LINEAR_INTERPOLATE(cache[0][i + 1], cache[1][i + 1], y_ofs);
+	  dest_pos[i + 2] = LINEAR_INTERPOLATE(cache[0][i + 2], cache[1][i + 2], y_ofs);
+	  dest_pos[i + 3] = LINEAR_INTERPOLATE(cache[0][i + 3], cache[1][i + 3], y_ofs);
+	}
+#endif
+      for (; i < dest->row_pixels_size; i++)
+	dest_pos[i] = LINEAR_INTERPOLATE(cache[0][i], cache[1][i], y_ofs);
+      dest_row += dest->row_size;
+    }
+}
+#endif
+
 static void
-IMAGE_SCALE_PREFIX(downsample)(struct image *dest, struct image *src)
+IMAGE_SCALE_PREFIX(downsample_xy)(struct image *dest, struct image *src)
 {
   /* FIXME slow */
   byte *rsrc = src->pixels, *psrc;
   byte *rdest = dest->pixels, *pdest;
-  uns x_inc = (dest->cols << 16) / src->cols, x_pos, x_inc_frac = 0xffffff / x_inc;
-  uns y_inc = (dest->rows << 16) / src->rows, y_pos = 0, y_inc_frac = 0xffffff / y_inc;
-  uns final_mul = ((u64)x_inc * y_inc) >> 16;
+  u64 x_inc = ((u64)dest->cols << 32) / src->cols, x_pos;
+  u64 y_inc = ((u64)dest->rows << 32) / src->rows, y_pos = 0;
+  uns x_inc_frac = (u64)0xffffffffff / x_inc;
+  uns y_inc_frac = (u64)0xffffffffff / y_inc;
+  uns final_mul = ((u64)(x_inc >> 16) * (y_inc >> 16)) >> 16;
   uns buf_size = dest->cols * IMAGE_SCALE_CHANNELS;
   u32 buf[buf_size], *pbuf;
   buf_size *= sizeof(u32);
@@ -36,7 +235,7 @@ IMAGE_SCALE_PREFIX(downsample)(struct image *dest, struct image *src)
           for (uns cols_counter = src->cols; cols_counter--; )
             {
 	      x_pos += x_inc;
-	      if (x_pos <= 0x10000)
+	      if (x_pos <= 0x100000000)
 	        {
 	          pbuf[0] += psrc[0];
 #		  if IMAGE_SCALE_CHANNELS >= 2
@@ -51,8 +250,8 @@ IMAGE_SCALE_PREFIX(downsample)(struct image *dest, struct image *src)
 	        }
 	      else
 	        {
-	          x_pos -= 0x10000;
-	          uns mul2 = x_pos * x_inc_frac;
+	          x_pos -= 0x100000000;
+	          uns mul2 = (uns)(x_pos >> 16) * x_inc_frac;
 	          uns mul1 = 0xffffff - mul2;
 	          pbuf[0] += (psrc[0] * mul1) >> 24;
 	          pbuf[0 + IMAGE_SCALE_CHANNELS] += (psrc[0] * mul2) >> 24;
@@ -75,10 +274,10 @@ IMAGE_SCALE_PREFIX(downsample)(struct image *dest, struct image *src)
 	}
       else
         {
-	  y_pos -= 0x10000;
+	  y_pos -= 0x100000000;
           pdest = rdest;
           rdest += dest->row_size;
-	  uns mul2 = y_pos * y_inc_frac;
+	  uns mul2 = (uns)(y_pos >> 16) * y_inc_frac;
 	  uns mul1 = 0xffffff - mul2;
 	  uns a0 = 0;
 #	  if IMAGE_SCALE_CHANNELS >= 2
@@ -93,7 +292,7 @@ IMAGE_SCALE_PREFIX(downsample)(struct image *dest, struct image *src)
           for (uns cols_counter = src->cols; cols_counter--; )
             {
 	      x_pos += x_inc;
-	      if (x_pos <= 0x10000)
+	      if (x_pos <= 0x100000000)
 	        {
 		  pbuf[0] += ((psrc[0] * mul1) >> 24);
 		  a0 += (psrc[0] * mul2) >> 24;
@@ -112,8 +311,8 @@ IMAGE_SCALE_PREFIX(downsample)(struct image *dest, struct image *src)
 	        }
 	      else
 	        {
-	          x_pos -= 0x10000;
-		  uns mul4 = x_pos * x_inc_frac;
+	          x_pos -= 0x100000000;
+		  uns mul4 = (uns)(x_pos >> 16) * x_inc_frac;
 		  uns mul3 = 0xffffff - mul4;
 		  uns mul13 = ((u64)mul1 * mul3) >> 24;
 		  uns mul23 = ((u64)mul2 * mul3) >> 24;
diff --git a/images/scale.c b/images/scale.c
index e987e0f5..925733a1 100644
--- a/images/scale.c
+++ b/images/scale.c
@@ -12,9 +12,18 @@
 #include "lib/lib.h"
 #include "images/images.h"
 #include "images/error.h"
+#include "images/math.h"
 
 #include <string.h>
 
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#define LINEAR_INTERPOLATE(a, b, t) (((int)((a) << 16) + (int)(t) * ((int)(b) - (int)(a)) + 0x8000) >> 16)
+
+/* Generate optimized code for various pixel formats */
+
 #define IMAGE_SCALE_PREFIX(x) image_scale_1_##x
 #define IMAGE_SCALE_PIXEL_SIZE 1
 #include "images/scale-gen.h"
@@ -31,42 +40,206 @@
 #define IMAGE_SCALE_PIXEL_SIZE 4
 #include "images/scale-gen.h"
 
-int
-image_scale(struct image_context *ctx, struct image *dest, struct image *src)
+/* Simple "nearest neighbour" algorithm */
+
+static void
+image_scale_nearest_xy(struct image *dest, struct image *src)
 {
-  if (src->cols < dest->cols || src->rows < dest->rows)
+  switch (src->pixel_size)
     {
-      IMAGE_ERROR(ctx, IMAGE_ERROR_INVALID_DIMENSIONS, "Upsampling not supported.");
-      return 0;
+      case 1:
+	image_scale_1_nearest_xy(dest, src);
+	return;
+      case 2:
+	image_scale_2_nearest_xy(dest, src);
+	return;
+      case 3:
+	image_scale_3_nearest_xy(dest, src);
+	return;
+      case 4:
+	image_scale_4_nearest_xy(dest, src);
+	return;
+      default:
+	ASSERT(0);
     }
-  if ((src->flags & IMAGE_PIXEL_FORMAT) != (dest->flags & IMAGE_PIXEL_FORMAT))
+}
+
+static inline void
+image_scale_nearest_x(struct image *dest, struct image *src)
+{
+  image_scale_nearest_xy(dest, src);
+}
+
+static void
+image_scale_nearest_y(struct image *dest, struct image *src)
+{
+  uns y_inc = (src->rows << 16) / dest->rows;
+  uns y_pos = y_inc >> 1;
+  byte *dest_pos = dest->pixels;
+  for (uns row_counter = dest->rows; row_counter--; )
     {
-      IMAGE_ERROR(ctx, IMAGE_ERROR_INVALID_PIXEL_FORMAT, "Different pixel format not supported.");
-      return 0;
+      byte *src_pos = src->pixels + (y_pos >> 16) * src->row_size;
+      y_pos += y_inc;
+      memcpy(dest_pos, src_pos, dest->row_pixels_size);
+      dest_pos += dest->row_size;
+    }
+}
+
+/* Bilinear filter */
+
+UNUSED static void
+image_scale_linear_y(struct image *dest, struct image *src)
+{
+  byte *dest_row = dest->pixels;
+  /* Handle problematic special case */
+  if (src->rows == 1)
+    {
+      for (uns y_counter = dest->rows; y_counter--; dest_row += dest->row_size)
+        memcpy(dest_row, src->pixels, src->row_pixels_size);
+      return;
+    }
+  /* Initialize the main loop */
+  uns y_inc  = ((src->rows - 1) << 16) / (dest->rows - 1), y_pos = 0;
+#ifdef __SSE2__
+  __m128i zero = _mm_setzero_si128();
+#endif
+  /* Main loop */
+  for (uns y_counter = dest->rows; --y_counter; )
+    {
+      uns coef = y_pos & 0xffff;
+      byte *src_row_1 = src->pixels + (y_pos >> 16) * src->row_size;
+      byte *src_row_2 = src_row_1 + src->row_size;
+      uns i = 0;
+#ifdef __SSE2__
+      /* SSE2 */
+      __m128i sse_coef = _mm_set1_epi16(coef >> 9);
+      for (; (int)i < (int)dest->row_pixels_size - 15; i += 16)
+        {
+	  __m128i a2 = _mm_loadu_si128((__m128i *)(src_row_1 + i));
+	  __m128i a1 = _mm_unpacklo_epi8(a2, zero);
+	  a2 = _mm_unpackhi_epi8(a2, zero);
+	  __m128i b2 = _mm_loadu_si128((__m128i *)(src_row_2 + i));
+	  __m128i b1 = _mm_unpacklo_epi8(b2, zero);
+	  b2 = _mm_unpackhi_epi8(b2, zero);
+	  b1 = _mm_sub_epi16(b1, a1);
+	  b2 = _mm_sub_epi16(b2, a2);
+	  a1 = _mm_slli_epi16(a1, 7);
+	  a2 = _mm_slli_epi16(a2, 7);
+	  b1 = _mm_mullo_epi16(b1, sse_coef);
+	  b2 = _mm_mullo_epi16(b2, sse_coef);
+	  a1 = _mm_add_epi16(a1, b1);
+	  a2 = _mm_add_epi16(a2, b2);
+	  a1 = _mm_srli_epi16(a1, 7);
+	  a2 = _mm_srli_epi16(a2, 7);
+	  a1 = _mm_packus_epi16(a1, a2);
+	  _mm_storeu_si128((__m128i *)(dest_row + i), a1);
+	}
+#endif
+      /* Unrolled loop using general-purpose registers */
+      for (; (int)i < (int)dest->row_pixels_size - 3; i += 4)
+        {
+	  dest_row[i + 0] = LINEAR_INTERPOLATE(src_row_1[i + 0], src_row_2[i + 0], coef);
+	  dest_row[i + 1] = LINEAR_INTERPOLATE(src_row_1[i + 1], src_row_2[i + 1], coef);
+	  dest_row[i + 2] = LINEAR_INTERPOLATE(src_row_1[i + 2], src_row_2[i + 2], coef);
+	  dest_row[i + 3] = LINEAR_INTERPOLATE(src_row_1[i + 3], src_row_2[i + 3], coef);
+	}
+      /* Remaining columns */
+      for (; i < dest->row_pixels_size; i++)
+	dest_row[i] = LINEAR_INTERPOLATE(src_row_1[i], src_row_2[i], coef);
+      dest_row += dest->row_size;
+      y_pos += y_inc;
     }
+  /* Always copy the last row - faster and also handle "y_pos == dest->rows * 0x10000" overflow */
+  memcpy(dest_row, src->pixels + src->image_size - src->row_size, src->row_pixels_size);
+}
+
+/* Box filter */
+
+static void
+image_scale_downsample_xy(struct image *dest, struct image *src)
+{
   switch (src->pixel_size)
     {
-      /* Gray */
       case 1:
-	image_scale_1_downsample(dest, src);
-	return 1;
-      /* GrayA */
+	image_scale_1_downsample_xy(dest, src);
+	return;
       case 2:
-	image_scale_2_downsample(dest, src);
-	return 1;
-      /* RGB */
+	image_scale_2_downsample_xy(dest, src);
+	return;
       case 3:
-	image_scale_3_downsample(dest, src);
-	return 1;
-      /* RGBA or aligned RGB */
+	image_scale_3_downsample_xy(dest, src);
+	return;
       case 4:
-	image_scale_4_downsample(dest, src);
-	return 1;
+	image_scale_4_downsample_xy(dest, src);
+	return;
       default:
 	ASSERT(0);
     }
 }
 
+/* General routine
+ * FIXME: customizable; implement at least bilinear and bicubic filters */
+
+int
+image_scale(struct image_context *ctx, struct image *dest, struct image *src)
+{
+  if ((src->flags & IMAGE_PIXEL_FORMAT) != (dest->flags & IMAGE_PIXEL_FORMAT))
+    {
+      IMAGE_ERROR(ctx, IMAGE_ERROR_INVALID_PIXEL_FORMAT, "Different pixel formats not supported.");
+      return 0;
+    }
+  if (dest->cols == src->cols)
+    {
+      if (dest->rows == src->rows)
+        {
+	  /* No scale, copy only */
+	  image_scale_nearest_y(dest, src);
+	  return 1;
+	}
+      else if (dest->rows < src->rows)
+        {
+	  /* Downscale vertically */
+	  image_scale_downsample_xy(dest, src);
+	  return 1;
+	}
+      else
+        {
+	  /* Upscale vertically */
+	  image_scale_nearest_y(dest, src);
+	  return 1;
+	}
+    }
+  else if (dest->rows == src->rows)
+    {
+      if (dest->cols < src->cols)
+        {
+          /* Downscale horizontally */
+          image_scale_downsample_xy(dest, src);
+          return 1;
+	}
+      else
+        {
+	  /* Upscale horizontally */
+	  image_scale_nearest_x(dest, src);
+	  return 1;
+	}
+    }
+  else
+    {
+      if (dest->cols <= src->cols && src->cols <= dest->cols)
+        {
+	  /* Downscale in both dimensions */
+          image_scale_downsample_xy(dest, src);
+	  return 1;
+	}
+      else
+        {
+	  image_scale_nearest_xy(dest, src);
+	  return 1;
+	}
+    }
+}
+
 void
 image_dimensions_fit_to_box(u32 *cols, u32 *rows, u32 max_cols, u32 max_rows, uns upsample)
 {
@@ -98,7 +271,7 @@ down_cols:
   *cols = MAX(*cols, 1);
   *rows = max_rows;
   return;
-down_rows:  
+down_rows:
   *rows = *rows * max_cols / *cols;
   *rows = MAX(*rows, 1);
   *cols = max_cols;
-- 
2.47.3