From bc0bc688c278231eaa9cd270c4afad69ff64a83f Mon Sep 17 00:00:00 2001
From: Peter de Rivaz <peter.derivaz@gmail.com>
Date: Thu, 1 May 2014 12:00:50 +0100
Subject: [PATCH] Added 16bit scaling functions to libyuv

This is an unoptimized C implementation of a 16bit
scaling function.

Change-Id: I4241442dde3cbf347988c555776a5cdd0189bb4d
---
 third_party/libyuv/include/libyuv/scale.h |  10 +
 third_party/libyuv/source/scale.c         | 481 ++++++++++++++++++++++
 2 files changed, 491 insertions(+)

diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h
index 21fe360ce..18082bc57 100644
--- a/third_party/libyuv/include/libyuv/scale.h
+++ b/third_party/libyuv/include/libyuv/scale.h
@@ -45,6 +45,16 @@ int I420Scale(const uint8* src_y, int src_stride_y,
               int dst_width, int dst_height,
               FilterMode filtering);
 
+int I42016Scale(const uint16* src_y, int src_stride_y,
+                const uint16* src_u, int src_stride_u,
+                const uint16* src_v, int src_stride_v,
+                int src_width, int src_height,
+                uint16* dst_y, int dst_stride_y,
+                uint16* dst_u, int dst_stride_u,
+                uint16* dst_v, int dst_stride_v,
+                int dst_width, int dst_height,
+                FilterMode filtering);
+
 // Legacy API.  Deprecated
 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
           int src_stride_y, int src_stride_u, int src_stride_v,
diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c
index 3c30b55c6..b9b57024d 100644
--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@@ -3878,6 +3878,487 @@ int ScaleOffset(const uint8* src, int src_width, int src_height,
   }
 }
 
+// 16bit scaling functions
+__inline static uint32 SumBox16(int iboxwidth, int iboxheight,
+                                int src_stride, const uint16* src_ptr) {
+  int x, y;
+  uint32 sum;
+  assert(iboxwidth > 0);
+  assert(iboxheight > 0);
+  sum = 0u;
+  for (y = 0; y < iboxheight; ++y) {
+    for (x = 0; x < iboxwidth; ++x) {
+      sum += src_ptr[x];
+    }
+    src_ptr += src_stride;
+  }
+  return sum;
+}
+
+static void ScalePlaneBoxRow16(int dst_width, int boxheight,
+                               int dx, int src_stride,
+                               const uint16* src_ptr, uint16* dst_ptr) {
+  int x = 0;
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    int boxwidth;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumBox16(boxwidth, boxheight, src_stride, src_ptr + ix) /
+        (boxwidth * boxheight);
+  }
+}
+
+__inline static uint32 SumPixels32(int iboxwidth, const uint32* src_ptr) {
+  uint32 sum;
+  int x;
+  assert(iboxwidth > 0);
+  sum = 0u;
+  for (x = 0; x < iboxwidth; ++x) {
+    sum += src_ptr[x];
+  }
+  return sum;
+}
+
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int scaletbl[2];
+  int minboxwidth = (dx >> 16);
+  scaletbl[0] = 65536 / (minboxwidth * boxheight);
+  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+  {
+  int *scaleptr = scaletbl - minboxwidth;
+  int x = 0;
+    int i;
+    for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+      int boxwidth;
+    x += dx;
+      boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumPixels32(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+    }
+  }
+}
+
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int boxwidth = (dx >> 16);
+  int scaleval = 65536 / (boxwidth * boxheight);
+  int x = 0;
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = SumPixels32(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
+static void ScaleAddRows16_C(const uint16* src_ptr, int src_stride,
+                             uint32* dst_ptr, int src_width, int src_height) {
+  int x, y;
+  assert(src_width > 0);
+  assert(src_height > 0);
+  for (x = 0; x < src_width; ++x) {
+    const uint16* s = src_ptr + x;
+    int sum = 0;
+    for (y = 0; y < src_height; ++y) {
+      sum += s[0];
+      s += src_stride;
+    }
+    dst_ptr[x] = sum;
+  }
+}
+
+/**
+ * Scale plane down to any dimensions, with interpolation.
+ * (boxfilter).
+ *
+ * Same method as SimpleScale, which is fixed point, outputting
+ * one pixel of destination using fixed point (16.16) to step
+ * through source, sampling a box of pixel with simple
+ * averaging.
+ */
+static void ScalePlaneBox16(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint16* src_ptr, uint16* dst_ptr) {
+  int dx, dy;
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  dy = (src_height << 16) / dst_height;
+  dx = (src_width << 16) / dst_width;
+  if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
+      dst_height * 2 > src_height) {
+    uint16* dst = dst_ptr;
+    int dy = (src_height << 16) / dst_height;
+    int dx = (src_width << 16) / dst_width;
+    int y = 0;
+    int j;
+    for (j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      const uint16* const src = src_ptr + iy * src_stride;
+      int boxheight;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+      boxheight = (y >> 16) - iy;
+      ScalePlaneBoxRow16(dst_width, boxheight,
+                         dx, src_stride,
+                         src, dst);
+
+      dst += dst_stride;
+    }
+  } else {
+    ALIGN16(uint32 row[kMaxInputWidth]);
+
+    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+                         const uint32* src_ptr, uint16* dst_ptr);
+
+    if (dx & 0xffff) {
+      ScaleAddCols = ScaleAddCols2_16_C;
+    } else {
+      ScaleAddCols = ScaleAddCols1_16_C;
+    }
+
+    {
+    int y = 0;
+      int j;
+      for (j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      const uint16* const src = src_ptr + iy * src_stride;
+        int boxheight;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+        boxheight = (y >> 16) - iy;
+      ScaleAddRows16_C(src, src_stride, row, src_width, boxheight);
+      ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+      dst_ptr += dst_stride;
+      }
+    }
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, with interpolation.
+ */
+static void ScalePlaneBilinearSimple16(int src_width, int src_height,
+                                       int dst_width, int dst_height,
+                                       int src_stride, int dst_stride,
+                                       const uint16* src_ptr, uint16* dst_ptr) {
+  int i, j;
+  uint16* dst = dst_ptr;
+  int dx = (src_width << 16) / dst_width;
+  int dy = (src_height << 16) / dst_height;
+  int maxx = ((src_width - 1) << 16) - 1;
+  int maxy = ((src_height - 1) << 16) - 1;
+  int y = (dst_height < src_height) ? 32768 :
+      (src_height << 16) / dst_height - 32768;
+  for (i = 0; i < dst_height; ++i) {
+    int cy = (y < 0) ? 0 : y;
+    int yi = cy >> 16;
+    int yf = cy & 0xffff;
+    const uint16* const src = src_ptr + yi * src_stride;
+    int x = (dst_width < src_width) ? 32768 :
+        (src_width << 16) / dst_width - 32768;
+    for (j = 0; j < dst_width; ++j) {
+      int cx = (x < 0) ? 0 : x;
+      int xi = cx >> 16;
+      int xf = cx & 0xffff;
+      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
+      int r1 = (src[xi + src_stride] * (65536 - xf) +
+          src[xi + src_stride + 1] * xf) >> 16;
+      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
+      x += dx;
+      if (x > maxx)
+        x = maxx;
+    }
+    dst += dst_stride - dst_width;
+    y += dy;
+    if (y > maxy)
+      y = maxy;
+  }
+}
+
+static void ScaleFilterCols16_C(uint16* dst_ptr, const uint16* src_ptr,
+                                int dst_width, int dx) {
+  int x = 0;
+  int j;
+  for (j = 0; j < dst_width; ++j) {
+    int xi = x >> 16;
+    int xf1 = x & 0xffff;
+    int xf0 = 65536 - xf1;
+
+    *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
+    x += dx;
+  }
+}
+
+// C version 8x2 -> 8x1
+static void ScaleFilterRows16_C(uint16* dst_ptr,
+                                const uint16* src_ptr, int src_stride,
+                                int dst_width, int source_y_fraction) {
+  int y1_fraction;
+  int y0_fraction;
+  const uint16* src_ptr1;
+  uint16* end;
+  assert(dst_width > 0);
+  y1_fraction = source_y_fraction;
+  y0_fraction = 256 - y1_fraction;
+  src_ptr1 = src_ptr + src_stride;
+  end = dst_ptr + dst_width;
+  do {
+    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
+    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
+    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
+    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
+    src_ptr += 8;
+    src_ptr1 += 8;
+    dst_ptr += 8;
+  } while (dst_ptr < end);
+  dst_ptr[0] = dst_ptr[-1];
+}
+
+/**
+ * Scale plane to/from any dimensions, with bilinear
+ * interpolation.
+ */
+static void ScalePlaneBilinear16(int src_width, int src_height,
+                                 int dst_width, int dst_height,
+                                 int src_stride, int dst_stride,
+                                 const uint16* src_ptr, uint16* dst_ptr) {
+  int dy;
+  int dx;
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  dy = (src_height << 16) / dst_height;
+  dx = (src_width << 16) / dst_width;
+  if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
+    ScalePlaneBilinearSimple16(src_width, src_height, dst_width, dst_height,
+                               src_stride, dst_stride, src_ptr, dst_ptr);
+
+  } else {
+    ALIGN16(uint16 row[kMaxInputWidth + 1]);
+
+    int y = 0;
+    int maxy = ((src_height - 1) << 16) - 1;  // max is filter of last 2 rows.
+    int j;
+    for (j = 0; j < dst_height; ++j) {
+      int iy = y >> 16;
+      int fy = (y >> 8) & 255;
+      const uint16* const src = src_ptr + iy * src_stride;
+      ScaleFilterRows16_C(row, src, src_stride, src_width, fy);
+      ScaleFilterCols16_C(dst_ptr, row, dst_width, dx);
+      dst_ptr += dst_stride;
+      y += dy;
+      if (y > maxy) {
+        y = maxy;
+      }
+    }
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions, without interpolation.
+ * Fixed point math is used for performance: The upper 16 bits
+ * of x and dx is the integer part of the source position and
+ * the lower 16 bits are the fixed decimal part.
+ */
+static void ScalePlaneSimple16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr) {
+  uint16* dst = dst_ptr;
+  int dx = (src_width << 16) / dst_width;
+  int y;
+  for (y = 0; y < dst_height; ++y) {
+    const uint16* const src = src_ptr + (y * src_height / dst_height) *
+        src_stride;
+    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
+    int x = 0;
+    int i;
+    for (i = 0; i < dst_width; ++i) {
+      *dst++ = src[x >> 16];
+      x += dx;
+    }
+    dst += dst_stride - dst_width;
+  }
+}
+
+/**
+ * Scale plane down, any size
+ *
+ * This is an optimized version for scaling down a plane to any size.
+ * The current implementation is ~10 times faster compared to the
+ * reference implementation for e.g. XGA->LowResPAL
+ *
+ */
+static void ScalePlaneDown16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr,
+                             FilterMode filtering) {
+  if (!filtering) {
+    ScalePlaneSimple16(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src_ptr, dst_ptr);
+  } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
+    // between 1/2x and 1x use bilinear
+    ScalePlaneBilinear16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src_ptr, dst_ptr);
+  } else {
+    ScalePlaneBox16(src_width, src_height, dst_width, dst_height,
+                    src_stride, dst_stride, src_ptr, dst_ptr);
+  }
+}
+
+/**
+ * Scale plane to/from any dimensions.
+ */
+static void ScalePlaneAnySize16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                FilterMode filtering) {
+  if (!filtering) {
+    ScalePlaneSimple16(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src_ptr, dst_ptr);
+  } else {
+    // fall back to non-optimized version
+    ScalePlaneBilinear16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src_ptr, dst_ptr);
+  }
+}
+
+static void CopyPlane16(int src_width, int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint16* src_ptr, uint16* dst_ptr) {
+  if (src_stride == src_width && dst_stride == dst_width) {
+    // All contiguous, so can use REALLY fast path.
+    memcpy(dst_ptr, src_ptr, src_width * src_height);
+  } else {
+    // Not all contiguous; must copy scanlines individually
+    const uint16* src = src_ptr;
+    uint16* dst = dst_ptr;
+    int i;
+    for (i = 0; i < src_height; ++i) {
+      memcpy(dst, src, src_width);
+      dst += dst_stride;
+      src += src_stride;
+    }
+  }
+}
+
+static void ScalePlane16(const uint16* src, int src_stride,
+                         int src_width, int src_height,
+                         uint16* dst, int dst_stride,
+                         int dst_width, int dst_height,
+                         FilterMode filtering, int use_ref) {
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane16(src_width, src_height, dst_width, dst_height, src_stride,
+                dst_stride, src, dst);
+  } else if (dst_width <= src_width && dst_height <= src_height) {
+    // Scale down.
+    if (use_ref) {
+      // For testing, allow the optimized versions to be disabled.
+      ScalePlaneDown16(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+#if 0
+    } else if (4 * dst_width == 3 * src_width &&
+               4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+    // 3/8 rounded up for odd sized chroma height.
+    } else if (8 * dst_width == 3 * src_width &&
+               dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
+      // optimized, 1/4
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
+      // optimized, 1/8
+      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+#endif
+    } else {
+      // Arbitrary downsample
+      ScalePlaneDown16(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+    }
+  } else {
+    // Arbitrary scale up and/or down.
+    ScalePlaneAnySize16(src_width, src_height, dst_width, dst_height,
+                        src_stride, dst_stride, src, dst, filtering);
+  }
+}
+
+/**
+ * Scale a plane.
+ *
+ * This function in turn calls a scaling function
+ * suitable for handling the desired resolutions.
+ *
+ */
+
+int I42016Scale(const uint16* src_y, int src_stride_y,
+                const uint16* src_u, int src_stride_u,
+                const uint16* src_v, int src_stride_v,
+                int src_width, int src_height,
+                uint16* dst_y, int dst_stride_y,
+                uint16* dst_u, int dst_stride_u,
+                uint16* dst_v, int dst_stride_v,
+                int dst_width, int dst_height,
+                FilterMode filtering) {
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    int halfheight;
+    src_height = -src_height;
+    halfheight = (src_height + 1) >> 1;
+    src_y = src_y + (src_height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  {
+  int src_halfwidth = (src_width + 1) >> 1;
+  int src_halfheight = (src_height + 1) >> 1;
+  int dst_halfwidth = (dst_width + 1) >> 1;
+  int dst_halfheight = (dst_height + 1) >> 1;
+
+  ScalePlane16(src_y, src_stride_y, src_width, src_height,
+               dst_y, dst_stride_y, dst_width, dst_height,
+               filtering, use_reference_impl_);
+  ScalePlane16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+               dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+               filtering, use_reference_impl_);
+  ScalePlane16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+               dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+               filtering, use_reference_impl_);
+  }
+  return 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv