diff --git a/Android.mk b/Android.mk index 181d0cd5..f56bc887 100644 --- a/Android.mk +++ b/Android.mk @@ -33,6 +33,7 @@ LOCAL_SRC_FILES := \ src/dsp/enc_mips32.c \ src/dsp/enc_sse2.c \ src/dsp/lossless.c \ + src/dsp/lossless_mips32.c \ src/dsp/lossless_sse2.c \ src/dsp/upsampling.c \ src/dsp/upsampling_mips32.c \ diff --git a/Makefile.vc b/Makefile.vc index e38ded2d..7cf779e3 100644 --- a/Makefile.vc +++ b/Makefile.vc @@ -174,6 +174,7 @@ DSP_DEC_OBJS = \ $(DIROBJ)\dsp\dec_neon.obj \ $(DIROBJ)\dsp\dec_sse2.obj \ $(DIROBJ)\dsp\lossless.obj \ + $(DIROBJ)\dsp\lossless_mips32.obj \ $(DIROBJ)\dsp\lossless_neon.obj \ $(DIROBJ)\dsp\lossless_sse2.obj \ $(DIROBJ)\dsp\upsampling.obj \ diff --git a/makefile.unix b/makefile.unix index 92654be0..27b9881f 100644 --- a/makefile.unix +++ b/makefile.unix @@ -109,6 +109,7 @@ DSP_DEC_OBJS = \ src/dsp/dec_neon.o \ src/dsp/dec_sse2.o \ src/dsp/lossless.o \ + src/dsp/lossless_mips32.o \ src/dsp/lossless_neon.o \ src/dsp/lossless_sse2.o \ src/dsp/upsampling.o \ diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index f6d5e041..284e8f4d 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c COMMON_SOURCES += dec_sse2.c COMMON_SOURCES += dsp.h COMMON_SOURCES += lossless.c +COMMON_SOURCES += lossless_mips32.c COMMON_SOURCES += lossless_neon.c COMMON_SOURCES += lossless_sse2.c COMMON_SOURCES += lossless.h diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c index ed9a815a..a91be156 100644 --- a/src/dsp/lossless.c +++ b/src/dsp/lossless.c @@ -332,7 +332,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = { #define APPROX_LOG_WITH_CORRECTION_MAX 65536 #define APPROX_LOG_MAX 4096 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 -float VP8LFastSLog2Slow(int v) { +static float FastSLog2Slow(int v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; @@ -351,14 +351,14 @@ float VP8LFastSLog2Slow(int v) { // The correction factor: log(1 + d) ~ d; for very small d values, so // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v // LOG_2_RECIPROCAL ~ 23/16 - correction = (23 * (orig_v % y)) >> 4; + correction = (23 * (orig_v & (y - 1))) >> 4; return v_f * (kLog2Table[v] + log_cnt) + correction; } else { return (float)(LOG_2_RECIPROCAL * v * log((double)v)); } } -float VP8LFastLog2Slow(int v) { +static float FastLog2Slow(int v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; @@ -374,7 +374,7 @@ float VP8LFastLog2Slow(int v) { if (orig_v >= APPROX_LOG_MAX) { // Since the division is still expensive, add this correction factor only // for large values of 'v'. - const int correction = (23 * (orig_v % y)) >> 4; + const int correction = (23 * (orig_v & (y - 1))) >> 4; log_2 += (double)correction / orig_v; } return (float)log_2; @@ -1473,8 +1473,12 @@ VP8LConvertFunc VP8LConvertBGRAToRGBA4444; VP8LConvertFunc VP8LConvertBGRAToRGB565; VP8LConvertFunc VP8LConvertBGRAToBGR; +VP8LFastLog2SlowFunc VP8LFastLog2Slow; +VP8LFastLog2SlowFunc VP8LFastSLog2Slow; + extern void VP8LDspInitSSE2(void); extern void VP8LDspInitNEON(void); +extern void VP8LDspInitMIPS32(void); void VP8LDspInit(void) { memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors)); @@ -1491,6 +1495,9 @@ void VP8LDspInit(void) { VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C; VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C; + VP8LFastLog2Slow = FastLog2Slow; + VP8LFastSLog2Slow = FastSLog2Slow; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -1502,6 +1509,11 @@ void VP8LDspInit(void) { if (VP8GetCPUInfo(kNEON)) { VP8LDspInitNEON(); } +#endif +#if defined(WEBP_USE_MIPS32) + if (VP8GetCPUInfo(kMIPS32)) { + VP8LDspInitMIPS32(); + } #endif } } diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h index 553793c8..5967b28b 100644 --- a/src/dsp/lossless.h +++ b/src/dsp/lossless.h @@ -122,8 +122,11 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size, #define LOG_LOOKUP_IDX_MAX 256 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX]; extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX]; -float VP8LFastLog2Slow(int v); -float VP8LFastSLog2Slow(int v); +typedef float (*VP8LFastLog2SlowFunc)(int v); + +extern VP8LFastLog2SlowFunc VP8LFastLog2Slow; +extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow; + static WEBP_INLINE float VP8LFastLog2(int v) { return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v); } diff --git a/src/dsp/lossless_mips32.c b/src/dsp/lossless_mips32.c new file mode 100644 index 00000000..673bea77 --- /dev/null +++ b/src/dsp/lossless_mips32.c @@ -0,0 +1,108 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// MIPS version of lossless functions +// +// Author(s): Jovan Zelincevic (jovan.zelincevic@imgtec.com) + +#include "./dsp.h" +#include "./lossless.h" + +#if defined(WEBP_USE_MIPS32) + +#include +#include +#include + +#define APPROX_LOG_WITH_CORRECTION_MAX 65536 +#define APPROX_LOG_MAX 4096 +#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 + +static float FastSLog2SlowMIPS32(int v) { + assert(v >= LOG_LOOKUP_IDX_MAX); + if (v < APPROX_LOG_WITH_CORRECTION_MAX) { + int log_cnt, y, correction; + const int c24 = 24; + const float v_f = (float)v; + int temp; + + // Xf = 256 = 2^8 + // log_cnt is index of leading one in upper 24 bits + __asm__ volatile( + "clz %[log_cnt], %[v] \n\t" + "addiu %[y], $zero, 1 \n\t" + "subu %[log_cnt], %[c24], %[log_cnt] \n\t" + "sllv %[y], %[y], %[log_cnt] \n\t" + "srlv %[temp], %[v], %[log_cnt] \n\t" + : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y), + [temp]"=r"(temp) + : [c24]"r"(c24), [v]"r"(v) + ); + + // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256 + // Xf = floor(Xf) * (1 + (v % y) / v) + // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v) + // The correction factor: log(1 + d) ~ d; for very small d values, so + // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v + // LOG_2_RECIPROCAL ~ 23/16 + + // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1) + correction = (23 * (v & (y - 1))) >> 4; + return v_f * (kLog2Table[temp] + log_cnt) + correction; + } else { + return (float)(LOG_2_RECIPROCAL * v * log((double)v)); + } +} + +static float FastLog2SlowMIPS32(int v) { + assert(v >= LOG_LOOKUP_IDX_MAX); + if (v < APPROX_LOG_WITH_CORRECTION_MAX) { + int log_cnt, y; + const int c24 = 24; + double log_2; + int temp; + + __asm__ volatile( + "clz %[log_cnt], %[v] \n\t" + "addiu %[y], $zero, 1 \n\t" + "subu %[log_cnt], %[c24], %[log_cnt] \n\t" + "sllv %[y], %[y], %[log_cnt] \n\t" + "srlv %[temp], %[v], %[log_cnt] \n\t" + : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y), + [temp]"=r"(temp) + : [c24]"r"(c24), [v]"r"(v) + ); + + log_2 = kLog2Table[temp] + log_cnt; + if (v >= APPROX_LOG_MAX) { + // Since the division is still expensive, add this correction factor only + // for large values of 'v'. + + const int correction = (23 * (v & (y - 1))) >> 4; + log_2 += (double)correction / v; + } + return (float)log_2; + } else { + return (float)(LOG_2_RECIPROCAL * log((double)v)); + } +} + +#endif // WEBP_USE_MIPS32 + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8LDspInitMIPS32(void); + +void VP8LDspInitMIPS32(void) { +#if defined(WEBP_USE_MIPS32) + VP8LFastSLog2Slow = FastSLog2SlowMIPS32; + VP8LFastLog2Slow = FastLog2SlowMIPS32; +#endif // WEBP_USE_MIPS32 +}