From a37a7b00d5416a25ed33b2a3e83c85b4aeda43cc Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Wed, 28 Jun 2017 05:09:14 -0700
Subject: [PATCH] wasm: Add DC16*, VE16, and HE16 functions

BUG=webp:352

Change-Id: Ia003257d00c2c2ea16a6e6344671237e78c0eac6
---
Review notes (text between '---' and the diff is not part of the commit):
 * splat_uint8() zero-initializes its temporary vector before use.
 * get_16_bytes() and add_horizontal_16() take const-qualified pointers.
 * Comments were added; the diffstat and hunk header were recounted. The blob
   ids on the 'index' line are those of the original submission.

 src/dsp/dec_wasm.c | 130 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)

diff --git a/src/dsp/dec_wasm.c b/src/dsp/dec_wasm.c
index 57879136..e13e7446 100644
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
@@ -9,17 +9,147 @@
 //
 // WebAssembly (WASM) version of some decoding functions.
 //
+// Based on dec_sse2.c
 
 #include "./dsp.h"
+#include "../dec/vp8i_dec.h"
 
 #if defined(WEBP_USE_WASM)
 
+// 16-byte vector types, declared with the vector_size extension.
+typedef int32_t int32x4 __attribute__((__vector_size__(16)));
+typedef uint32_t uint32x4 __attribute__((__vector_size__(16)));
+typedef int16_t int16x8 __attribute__((__vector_size__(16)));
+typedef uint16_t uint16x8 __attribute__((__vector_size__(16)));
+typedef int8_t int8x16 __attribute__((__vector_size__(16)));
+typedef uint8_t uint8x16 __attribute__((__vector_size__(16)));
+
+//------------------------------------------------------------------------------
+// Helpers
+
+// Loads 16 bytes from 'src' into a vector. memcpy() is used for the load, so
+// 'src' does not need any particular alignment.
+static WEBP_INLINE uint8x16 get_16_bytes(const uint8_t* src) {
+  uint8x16 a;
+  memcpy(&a, src, 16);
+  return a;
+}
+
+// Returns a vector whose 16 lanes all hold the low 8 bits of 'val'.
+static WEBP_INLINE uint8x16 splat_uint8(uint32_t val) {
+  // Zero-initialize the temporary so that no indeterminate lane is ever read;
+  // only lane 0 feeds the broadcast shuffle below.
+  uint8x16 a = {0};
+  a[0] = (uint8_t)val;
+  return (uint8x16)__builtin_shufflevector(
+      a, a, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// Luma 16x16
+
+// Fills the 16 rows of 16 bytes starting at 'dst' (row stride BPS) with 'v'.
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+  int j;
+  const uint8x16 values = splat_uint8(v);
+  for (j = 0; j < 16; ++j) {
+    memcpy(dst, &values, 16);
+    dst += BPS;
+  }
+}
+
+// Vertical prediction: copies the row above the block into all 16 rows.
+static void VE16(uint8_t* dst) {
+  const uint8x16 top = get_16_bytes(dst - BPS);
+  int j;
+  for (j = 0; j < 16; ++j) {
+    memcpy(dst + j * BPS, &top, 16);
+  }
+}
+
+// Horizontal prediction: each row is filled with the pixel on its left.
+static void HE16(uint8_t* dst) {
+  int j;
+  for (j = 16; j > 0; --j) {
+    const uint8x16 values = splat_uint8(dst[-1]);
+    memcpy(dst, &values, 16);
+    dst += BPS;
+  }
+}
+
+// Returns the sum of the 16 bytes stored at 'src'.
+static WEBP_INLINE uint32_t add_horizontal_16(const uint8_t* src) {
+  const uint8x16 zero = (uint8x16){0};
+  const uint8x16 a = get_16_bytes(src);
+  // Interleave each half of 'a' with zero bytes, which widens the bytes to
+  // 16-bit lanes (assuming little-endian lane order, as on WebAssembly).
+  const uint16x8 _a_lbw = (uint16x8)__builtin_shufflevector(
+      a, zero, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16);
+  const uint16x8 _a_hbw = (uint16x8)__builtin_shufflevector(
+      a, zero, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15, 16);
+  // Fold the partial sums 8 -> 4 -> 2 -> 1; the total ends up in lane 0.
+  const uint16x8 sum_a = _a_lbw + _a_hbw;
+  const uint16x8 sum_b = (uint16x8)__builtin_shufflevector(
+      sum_a, sum_a, 4, 5, 6, 7, 4, 5, 6, 7);
+  const uint16x8 sum_c = sum_a + sum_b;
+  const uint16x8 sum_d = (uint16x8)__builtin_shufflevector(
+      sum_c, sum_c, 2, 3, 2, 3, 2, 3, 2, 3);
+  const uint16x8 sum_e = sum_c + sum_d;
+  const uint16x8 sum_f = (uint16x8)__builtin_shufflevector(
+      sum_e, sum_e, 1, 1, 1, 1, 1, 1, 1, 1);
+  const uint16x8 sum_g = sum_e + sum_f;
+  return sum_g[0] & 0xffff;
+}
+
+// DC prediction from both the row above and the column to the left.
+static void DC16(uint8_t* dst) {
+  const uint32_t sum = add_horizontal_16(dst - BPS);
+  int left = 0;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    left += dst[-1 + j * BPS];
+  }
+  {
+    // 32 samples were summed: add 16 for rounding, then divide by 32.
+    const int DC = sum + left + 16;
+    Put16(DC >> 5, dst);
+  }
+}
+
+// DC prediction from the left column only (top samples not available).
+static void DC16NoTop(uint8_t* dst) {
+  int DC = 8;  // rounding term for the division by 16 below
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+// DC prediction from the top row only (left samples not available).
+static void DC16NoLeft(uint8_t* dst) {
+  const int DC = 8 + add_horizontal_16(dst - BPS);
+  Put16(DC >> 4, dst);
+}
+
+// DC prediction with neither top nor left samples: fills with 0x80.
+static void DC16NoTopLeft(uint8_t* dst) {
+  Put16(0x80, dst);
+}
+
 //------------------------------------------------------------------------------
 // Entry point
 
 extern void VP8DspInitWASM(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitWASM(void) {
+  // Entries 0 and 2..6 are overridden here; entry 1 is left untouched.
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
 }
 
 #else  // !WEBP_USE_WASM