MIPS: MIPS32r1: Added optimizations for FastLog2

Optimized functions: VP8LFastLog2Slow and VP8LFastSLog2Slow. Also replaced some "% y" with "& (y - 1)" in the C version (since y is a power of two). Change-Id: I875170384e3c333812ca42d6ce7278aecabd60f0
2014-03-24 14:47:19 +01:00 · 2014-03-24 14:47:19 +01:00 · baabf1ea3a
commit baabf1ea3a
parent 3fe0291530
7 changed files with 133 additions and 6 deletions
--- a/Android.mk
+++ b/Android.mk
@ -33,6 +33,7 @@ LOCAL_SRC_FILES := \
    src/dsp/enc_mips32.c \
    src/dsp/enc_sse2.c \
    src/dsp/lossless.c \
+    src/dsp/lossless_mips32.c \
    src/dsp/lossless_sse2.c \
    src/dsp/upsampling.c \
    src/dsp/upsampling_mips32.c \
--- a/Makefile.vc
+++ b/Makefile.vc
@ -174,6 +174,7 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
+    $(DIROBJ)\dsp\lossless_mips32.obj \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
    $(DIROBJ)\dsp\upsampling.obj \
--- a/makefile.unix
+++ b/makefile.unix
@ -109,6 +109,7 @@ DSP_DEC_OBJS = \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
    src/dsp/lossless.o \
+    src/dsp/lossless_mips32.o \
    src/dsp/lossless_neon.o \
    src/dsp/lossless_sse2.o \
    src/dsp/upsampling.o \
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dec_sse2.c
 COMMON_SOURCES += dsp.h
 COMMON_SOURCES += lossless.c
+COMMON_SOURCES += lossless_mips32.c
 COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += lossless_sse2.c
 COMMON_SOURCES += lossless.h
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -332,7 +332,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
 #define APPROX_LOG_WITH_CORRECTION_MAX  65536
 #define APPROX_LOG_MAX                   4096
 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-float VP8LFastSLog2Slow(int v) {
+static float FastSLog2Slow(int v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    int log_cnt = 0;
@ -351,14 +351,14 @@ float VP8LFastSLog2Slow(int v) {
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16
-    correction = (23 * (orig_v % y)) >> 4;
+    correction = (23 * (orig_v & (y - 1))) >> 4;
    return v_f * (kLog2Table[v] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
 }

-float VP8LFastLog2Slow(int v) {
+static float FastLog2Slow(int v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    int log_cnt = 0;
@ -374,7 +374,7 @@ float VP8LFastLog2Slow(int v) {
    if (orig_v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.
-      const int correction = (23 * (orig_v % y)) >> 4;
+      const int correction = (23 * (orig_v & (y - 1))) >> 4;
      log_2 += (double)correction / orig_v;
    }
    return (float)log_2;
@ -1473,8 +1473,12 @@ VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
 VP8LConvertFunc VP8LConvertBGRAToRGB565;
 VP8LConvertFunc VP8LConvertBGRAToBGR;

+VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
 extern void VP8LDspInitSSE2(void);
 extern void VP8LDspInitNEON(void);
+extern void VP8LDspInitMIPS32(void);

 void VP8LDspInit(void) {
  memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
@ -1491,6 +1495,9 @@ void VP8LDspInit(void) {
  VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;

+  VP8LFastLog2Slow = FastLog2Slow;
+  VP8LFastSLog2Slow = FastSLog2Slow;
+
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@ -1502,6 +1509,11 @@ void VP8LDspInit(void) {
    if (VP8GetCPUInfo(kNEON)) {
      VP8LDspInitNEON();
    }
+#endif
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8LDspInitMIPS32();
+    }
 #endif
  }
 }
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -122,8 +122,11 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
 #define LOG_LOOKUP_IDX_MAX 256
 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
 extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-float VP8LFastLog2Slow(int v);
-float VP8LFastSLog2Slow(int v);
+typedef float (*VP8LFastLog2SlowFunc)(int v);
+
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
 static WEBP_INLINE float VP8LFastLog2(int v) {
  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
 }
--- a/src/dsp/lossless_mips32.c
+++ b/src/dsp/lossless_mips32.c
@ -0,0 +1,108 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of lossless functions
+//
+// Author(s):  Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+#include "./lossless.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include <math.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define APPROX_LOG_WITH_CORRECTION_MAX  65536
+#define APPROX_LOG_MAX                   4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+
+static float FastSLog2SlowMIPS32(int v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    int log_cnt, y, correction;
+    const int c24 = 24;
+    const float v_f = (float)v;
+    int temp;
+    // Fast path for v * log2(v); the else branch computes it via log().
+    // The asm below computes log_cnt = 24 - clz(v), y = 1 << log_cnt and
+    // temp = v >> log_cnt, i.e. v reduced to its top 8 significant bits.
+    __asm__ volatile(
+      "clz      %[log_cnt], %[v]                      \n\t"
+      "addiu    %[y],       $zero,        1           \n\t"
+      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
+      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
+      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
+      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+        [temp]"=r"(temp)
+      : [c24]"r"(c24), [v]"r"(v)
+    );
+
+    // v = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+    // Xf = floor(Xf) * (1 + (v % y) / v)
+    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+    // The correction factor: log(1 + d) ~ d for very small d values, so
+    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y) / v
+    // LOG_2_RECIPROCAL ~ 23/16; temp holds floor(Xf), the kLog2Table index.
+
+    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1), y being a power of two
+    correction = (23 * (v & (y - 1))) >> 4;
+    return v_f * (kLog2Table[temp] + log_cnt) + correction;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+  }
+}
+
+static float FastLog2SlowMIPS32(int v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    int log_cnt, y;
+    const int c24 = 24;
+    double log_2;
+    int temp;
+    // log_cnt = 24 - clz(v); y = 1 << log_cnt; temp = v >> log_cnt (< 256).
+    __asm__ volatile(
+      "clz      %[log_cnt], %[v]                      \n\t"
+      "addiu    %[y],       $zero,        1           \n\t"
+      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
+      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
+      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
+      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+        [temp]"=r"(temp)
+      : [c24]"r"(c24), [v]"r"(v)
+    );
+    // Table value for the top 8 bits of v, plus the number of bits shifted out.
+    log_2 = kLog2Table[temp] + log_cnt;
+    if (v >= APPROX_LOG_MAX) {
+      // Since the division is still expensive, add this correction factor only
+      // for large values of 'v'.
+      // (v & (y - 1)) == (v % y) as y is a power of two; 23/16 ~ 1/ln(2).
+      const int correction = (23 * (v & (y - 1))) >> 4;
+      log_2 += (double)correction / v;
+    }
+    return (float)log_2;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * log((double)v));
+  }
+}
+
+#endif  // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMIPS32(void);
+// Installs the MIPS32 log2 routines; a no-op without WEBP_USE_MIPS32.
+void VP8LDspInitMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+  VP8LFastSLog2Slow = FastSLog2SlowMIPS32;
+  VP8LFastLog2Slow = FastLog2SlowMIPS32;
+#endif  // WEBP_USE_MIPS32
+}