X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=sidebyside;f=images%2Fscale.c;h=41fda01d5be4c158b597c0fd1cbac4cec7830710;hb=09e7fe5641b94148d998a1b620bf694f353cb17b;hp=f617b01e49589678c3c11957faf7c6bf921f2123;hpb=966305189fb3c543170117da57f110a8e9851ac2;p=libucw.git

diff --git a/images/scale.c b/images/scale.c
index f617b01e..41fda01d 100644
--- a/images/scale.c
+++ b/images/scale.c
@@ -9,10 +9,21 @@
 
 #undef LOCAL_DEBUG
 
-#include "lib/lib.h"
+#include "ucw/lib.h"
 #include "images/images.h"
+#include "images/error.h"
+#include "images/math.h"
+
 #include <string.h>
 
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#define LINEAR_INTERPOLATE(a, b, t) (((int)((a) << 16) + (int)(t) * ((int)(b) - (int)(a)) + 0x8000) >> 16)
+
+/* Generate optimized code for various pixel formats */
+
 #define IMAGE_SCALE_PREFIX(x) image_scale_1_##x
 #define IMAGE_SCALE_PIXEL_SIZE 1
 #include "images/scale-gen.h"
@@ -29,52 +40,216 @@
 #define IMAGE_SCALE_PIXEL_SIZE 4
 #include "images/scale-gen.h"
 
-int
-image_scale(struct image_thread *it, struct image *dest, struct image *src)
+/* Simple "nearest neighbour" algorithm */
+
+static void
+image_scale_nearest_xy(struct image *dest, struct image *src)
 {
-  if (src->cols < dest->cols || src->rows < dest->rows)
+  switch (src->pixel_size)
     {
-      image_thread_err(it, IMAGE_ERR_INVALID_DIMENSIONS, "Upsampling not supported.");
-      return 0;
+      case 1:
+	image_scale_1_nearest_xy(dest, src);
+	return;
+      case 2:
+	image_scale_2_nearest_xy(dest, src);
+	return;
+      case 3:
+	image_scale_3_nearest_xy(dest, src);
+	return;
+      case 4:
+	image_scale_4_nearest_xy(dest, src);
+	return;
+      default:
+	ASSERT(0);
     }
-  if ((src->flags & IMAGE_PIXEL_FORMAT) != (dest->flags & IMAGE_PIXEL_FORMAT))
+}
+
+static inline void
+image_scale_nearest_x(struct image *dest, struct image *src)
+{
+  image_scale_nearest_xy(dest, src);
+}
+
+static void
+image_scale_nearest_y(struct image *dest, struct image *src)
+{
+  uns y_inc = (src->rows << 16) / dest->rows;
+  uns y_pos = y_inc >> 1;
+  byte *dest_pos = dest->pixels;
+  for (uns row_counter = dest->rows; row_counter--; )
     {
-      image_thread_err(it, IMAGE_ERR_INVALID_PIXEL_FORMAT, "Different pixel format not supported.");
-      return 0;
+      byte *src_pos = src->pixels + (y_pos >> 16) * src->row_size;
+      y_pos += y_inc;
+      memcpy(dest_pos, src_pos, dest->row_pixels_size);
+      dest_pos += dest->row_size;
+    }
+}
+
+/* Bilinear filter */
+
+UNUSED static void
+image_scale_linear_y(struct image *dest, struct image *src)
+{
+  byte *dest_row = dest->pixels;
+  /* Handle problematic special case */
+  if (src->rows == 1)
+    {
+      for (uns y_counter = dest->rows; y_counter--; dest_row += dest->row_size)
+        memcpy(dest_row, src->pixels, src->row_pixels_size);
+      return;
+    }
+  /* Initialize the main loop */
+  uns y_inc  = ((src->rows - 1) << 16) / (dest->rows - 1), y_pos = 0;
+#ifdef __SSE2__
+  __m128i zero = _mm_setzero_si128();
+#endif
+  /* Main loop */
+  for (uns y_counter = dest->rows; --y_counter; )
+    {
+      uns coef = y_pos & 0xffff;
+      byte *src_row_1 = src->pixels + (y_pos >> 16) * src->row_size;
+      byte *src_row_2 = src_row_1 + src->row_size;
+      uns i = 0;
+#ifdef __SSE2__
+      /* SSE2 */
+      __m128i sse_coef = _mm_set1_epi16(coef >> 9);
+      for (; (int)i < (int)dest->row_pixels_size - 15; i += 16)
+        {
+	  __m128i a2 = _mm_loadu_si128((__m128i *)(src_row_1 + i));
+	  __m128i a1 = _mm_unpacklo_epi8(a2, zero);
+	  a2 = _mm_unpackhi_epi8(a2, zero);
+	  __m128i b2 = _mm_loadu_si128((__m128i *)(src_row_2 + i));
+	  __m128i b1 = _mm_unpacklo_epi8(b2, zero);
+	  b2 = _mm_unpackhi_epi8(b2, zero);
+	  b1 = _mm_sub_epi16(b1, a1);
+	  b2 = _mm_sub_epi16(b2, a2);
+	  a1 = _mm_slli_epi16(a1, 7);
+	  a2 = _mm_slli_epi16(a2, 7);
+	  b1 = _mm_mullo_epi16(b1, sse_coef);
+	  b2 = _mm_mullo_epi16(b2, sse_coef);
+	  a1 = _mm_add_epi16(a1, b1);
+	  a2 = _mm_add_epi16(a2, b2);
+	  a1 = _mm_srli_epi16(a1, 7);
+	  a2 = _mm_srli_epi16(a2, 7);
+	  a1 = _mm_packus_epi16(a1, a2);
+	  _mm_storeu_si128((__m128i *)(dest_row + i), a1);
+	}
+#endif
+      /* Unrolled loop using general-purpose registers */
+      for (; (int)i < (int)dest->row_pixels_size - 3; i += 4)
+        {
+	  dest_row[i + 0] = LINEAR_INTERPOLATE(src_row_1[i + 0], src_row_2[i + 0], coef);
+	  dest_row[i + 1] = LINEAR_INTERPOLATE(src_row_1[i + 1], src_row_2[i + 1], coef);
+	  dest_row[i + 2] = LINEAR_INTERPOLATE(src_row_1[i + 2], src_row_2[i + 2], coef);
+	  dest_row[i + 3] = LINEAR_INTERPOLATE(src_row_1[i + 3], src_row_2[i + 3], coef);
+	}
+      /* Remaining columns */
+      for (; i < dest->row_pixels_size; i++)
+	dest_row[i] = LINEAR_INTERPOLATE(src_row_1[i], src_row_2[i], coef);
+      dest_row += dest->row_size;
+      y_pos += y_inc;
     }
+  /* Always copy the last row - faster and also handle "y_pos == dest->rows * 0x10000" overflow */
+  memcpy(dest_row, src->pixels + src->image_size - src->row_size, src->row_pixels_size);
+}
+
+/* Box filter */
+
+static void
+image_scale_downsample_xy(struct image *dest, struct image *src)
+{
   switch (src->pixel_size)
     {
-      /* Gray */
       case 1:
-	image_scale_1_downsample(dest, src);
-	return 1;
-      /* GrayA */
+	image_scale_1_downsample_xy(dest, src);
+	return;
       case 2:
-	image_scale_2_downsample(dest, src);
-	return 1;
-      /* RGB */
+	image_scale_2_downsample_xy(dest, src);
+	return;
       case 3:
-	image_scale_3_downsample(dest, src);
-	return 1;
-      /* RGBA or aligned RGB */
+	image_scale_3_downsample_xy(dest, src);
+	return;
       case 4:
-	image_scale_4_downsample(dest, src);
-	return 1;
+	image_scale_4_downsample_xy(dest, src);
+	return;
       default:
 	ASSERT(0);
     }
 }
 
+/* General routine
+ * FIXME: customizable; implement at least bilinear and bicubic filters */
+
+int
+image_scale(struct image_context *ctx, struct image *dest, struct image *src)
+{
+  if ((src->flags & IMAGE_PIXEL_FORMAT) != (dest->flags & IMAGE_PIXEL_FORMAT))
+    {
+      IMAGE_ERROR(ctx, IMAGE_ERROR_INVALID_PIXEL_FORMAT, "Different pixel formats not supported.");
+      return 0;
+    }
+  if (dest->cols == src->cols)
+    {
+      if (dest->rows == src->rows)
+        {
+	  /* No scale, copy only */
+	  image_scale_nearest_y(dest, src);
+	  return 1;
+	}
+      else if (dest->rows < src->rows)
+        {
+	  /* Downscale vertically */
+	  image_scale_downsample_xy(dest, src);
+	  return 1;
+	}
+      else
+        {
+	  /* Upscale vertically */
+	  image_scale_nearest_y(dest, src);
+	  return 1;
+	}
+    }
+  else if (dest->rows == src->rows)
+    {
+      if (dest->cols < src->cols)
+        {
+          /* Downscale horizontally */
+          image_scale_downsample_xy(dest, src);
+          return 1;
+	}
+      else
+        {
+	  /* Upscale horizontally */
+	  image_scale_nearest_x(dest, src);
+	  return 1;
+	}
+    }
+  else
+    {
+      if (dest->cols <= src->cols && dest->rows <= src->rows)
+        {
+	  /* Downscale in both dimensions */
+          image_scale_downsample_xy(dest, src);
+	  return 1;
+	}
+      else
+        {
+	  image_scale_nearest_xy(dest, src);
+	  return 1;
+	}
+    }
+}
+
 void
-image_dimensions_fit_to_box(u32 *cols, u32 *rows, u32 max_cols, u32 max_rows, uns upsample)
+image_dimensions_fit_to_box(uns *cols, uns *rows, uns max_cols, uns max_rows, uns upsample)
 {
-  ASSERT(*cols && *rows && *cols <= 0xffff && *rows <= 0xffff);
-  ASSERT(max_cols && max_rows && max_cols <= 0xffff && max_rows <= 0xffff);
+  ASSERT(image_dimensions_valid(*cols, *rows));
+  ASSERT(image_dimensions_valid(max_cols, max_rows));
   if (*cols <= max_cols && *rows <= max_rows)
     {
       if (!upsample)
 	return;
-      if (max_cols / *cols > max_rows / *rows)
+      if (max_cols * *rows > max_rows * *cols)
         {
 	  *cols = *cols * max_rows / *rows;
 	  *cols = MIN(*cols, max_cols);
@@ -96,9 +271,8 @@ down_cols:
   *cols = MAX(*cols, 1);
   *rows = max_rows;
   return;
-down_rows:  
+down_rows:
   *rows = *rows * max_cols / *cols;
   *rows = MAX(*rows, 1);
   *cols = max_cols;
-  return;
 }