diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index fd9afd2ac..b1eba55c5 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -456,9 +456,8 @@ $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
 $vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon_asm/;
+specialize qw/vp8_short_walsh4x4 sse2 media neon/;
 $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
-$vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 
 #
 # Quantizer
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
deleted file mode 100644
index 22266297a..000000000
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_walsh4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0   short *input,
-; r1   short *output,
-; r2   int pitch
-|vp8_short_walsh4x4_neon| PROC
-
-    vld1.16         {d0}, [r0@64], r2   ; load input
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {d2}, [r0@64], r2
-    vld1.16         {d3}, [r0@64]
-
-    ;First for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-
-    vmov.s32        q15, #3             ; add 3 to all values
-
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
-    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
-    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
-    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
-
-    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
-    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
-    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
-    vceq.s16        d16, d4, #0         ; a1 == 0
-    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
-
-    vadd.s16        d0, d4, d5          ; a1 + d1
-    vmvn            d16, d16            ; a1 != 0
-    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
-    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
-    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
-    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
-
-    ;Second for-loop
-    ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d1, d3
-    vtrn.32         d0, d2
-    vtrn.16         d2, d3
-    vtrn.16         d0, d1
-
-    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
-    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
-    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
-    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
-
-    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
-    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
-    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
-    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
-
-    vclt.s32        q8, q0, #0
-    vclt.s32        q9, q1, #0
-    vclt.s32        q10, q2, #0
-    vclt.s32        q11, q3, #0
-
-    ; subtract -1 (or 0)
-    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
-    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
-    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
-    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
-
-    vadd.s32        q8, q0, q15         ; a2 + 3
-    vadd.s32        q9, q1, q15         ; b2 + 3
-    vadd.s32        q10, q2, q15        ; c2 + 3
-    vadd.s32        q11, q3, q15        ; d2 + 3
-
-    ; vrshrn? would add 1 << 3-1 = 2
-    vshrn.s32       d0, q8, #3
-    vshrn.s32       d1, q9, #3
-    vshrn.s32       d2, q10, #3
-    vshrn.s32       d3, q11, #3
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
new file mode 100644
index 000000000..d6b67f895
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    uint16x4_t d16u16;
+    int16x8_t q0s16, q1s16;
+    int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q15s32;
+    uint32x4_t q8u32, q9u32, q10u32, q11u32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    dEmptys16 = vdup_n_s16(0);
+    qEmptys32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(3);
+
+    d0s16 = vld1_s16(input);
+    input += pitch/2;
+    d1s16 = vld1_s16(input);
+    input += pitch/2;
+    d2s16 = vld1_s16(input);
+    input += pitch/2;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+    d4s16 = vshl_n_s16(d4s16, 2);
+    d5s16 = vshl_n_s16(d5s16, 2);
+    d6s16 = vshl_n_s16(d6s16, 2);
+    d7s16 = vshl_n_s16(d7s16, 2);
+
+    d16u16 = vceq_s16(d4s16, dEmptys16);
+    d16u16 = vmvn_u16(d16u16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d3s16 = vsub_s16(d4s16, d5s16);
+    d1s16 = vadd_s16(d7s16, d6s16);
+    d2s16 = vsub_s16(d7s16, d6s16);
+
+    d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+    // Second for-loop
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp2.val[1]));  // d3
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp2.val[0]));  // d1
+
+    q8s32  = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+    q9s32  = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+    q0s32 = vaddq_s32(q8s32, q9s32);
+    q1s32 = vaddq_s32(q11s32, q10s32);
+    q2s32 = vsubq_s32(q11s32, q10s32);
+    q3s32 = vsubq_s32(q8s32, q9s32);
+
+    q8u32  = vcltq_s32(q0s32, qEmptys32);
+    q9u32  = vcltq_s32(q1s32, qEmptys32);
+    q10u32 = vcltq_s32(q2s32, qEmptys32);
+    q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+    q8s32  = vreinterpretq_s32_u32(q8u32);
+    q9s32  = vreinterpretq_s32_u32(q9u32);
+    q10s32 = vreinterpretq_s32_u32(q10u32);
+    q11s32 = vreinterpretq_s32_u32(q11u32);
+
+    q0s32 = vsubq_s32(q0s32, q8s32);
+    q1s32 = vsubq_s32(q1s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q3s32 = vsubq_s32(q3s32, q11s32);
+
+    q8s32  = vaddq_s32(q0s32, q15s32);
+    q9s32  = vaddq_s32(q1s32, q15s32);
+    q10s32 = vaddq_s32(q2s32, q15s32);
+    q11s32 = vaddq_s32(q3s32, q15s32);
+
+    d0s16 = vshrn_n_s32(q8s32, 3);
+    d1s16 = vshrn_n_s32(q9s32, 3);
+    d2s16 = vshrn_n_s32(q10s32, 3);
+    d3s16 = vshrn_n_s32(q11s32, 3);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 573304863..0d01091c8 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -41,6 +41,6 @@ VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/subtract_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
 
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c