From 6d6fdd9c3d763c6fbcd4f79a07ed2ec131500bfd Mon Sep 17 00:00:00 2001 From: James Yu Date: Mon, 6 Jan 2014 17:56:49 +0800 Subject: [PATCH] VP8 encoder for ARMv8 by using NEON intrinsics 3 Add subtract_neon.c - vp8_subtract_b_neon - vp8_subtract_mby_neon - vp8_subtract_mbuv_neon Change-Id: If9a17a093478552e3e3276eeaa3f098b9021d08c Signed-off-by: James Yu --- test/subtract_test.cc | 2 +- vp8/common/rtcd_defs.pl | 9 +- vp8/encoder/arm/neon/subtract_neon.asm | 205 ------------------------- vp8/encoder/arm/neon/subtract_neon.c | 154 +++++++++++++++++++ vp8/vp8cx_arm.mk | 2 +- 5 files changed, 159 insertions(+), 213 deletions(-) delete mode 100644 vp8/encoder/arm/neon/subtract_neon.asm create mode 100644 vp8/encoder/arm/neon/subtract_neon.c diff --git a/test/subtract_test.cc b/test/subtract_test.cc index 6619fb158..ff42725f0 100644 --- a/test/subtract_test.cc +++ b/test/subtract_test.cc @@ -105,7 +105,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) { INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest, ::testing::Values(vp8_subtract_b_c)); -#if HAVE_NEON_ASM +#if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest, ::testing::Values(vp8_subtract_b_neon)); #endif diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index b1eba55c5..204cbf0ee 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -502,19 +502,16 @@ specialize qw/vp8_mbuverror mmx sse2/; $vp8_mbuverror_sse2=vp8_mbuverror_xmm; add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch"; -specialize qw/vp8_subtract_b mmx sse2 media neon_asm/; +specialize qw/vp8_subtract_b mmx sse2 media neon/; $vp8_subtract_b_media=vp8_subtract_b_armv6; -$vp8_subtract_b_neon_asm=vp8_subtract_b_neon; add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"; -specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/; +specialize qw/vp8_subtract_mby mmx sse2 media neon/; $vp8_subtract_mby_media=vp8_subtract_mby_armv6; -$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon; add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"; -specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/; +specialize qw/vp8_subtract_mbuv mmx sse2 media neon/; $vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6; -$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon; # # Motion search diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm deleted file mode 100644 index 840cb33d9..000000000 --- a/vp8/encoder/arm/neon/subtract_neon.asm +++ /dev/null @@ -1,205 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_subtract_b_neon| - EXPORT |vp8_subtract_mby_neon| - EXPORT |vp8_subtract_mbuv_neon| - - INCLUDE vp8_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) -|vp8_subtract_b_neon| PROC - - stmfd sp!, {r4-r7} - - ldr r3, [r0, #vp8_block_base_src] - ldr r4, [r0, #vp8_block_src] - ldr r5, [r0, #vp8_block_src_diff] - ldr r3, [r3] - ldr r6, [r0, #vp8_block_src_stride] - add r3, r3, r4 ; src = *base_src + src - ldr r7, [r1, #vp8_blockd_predictor] - - vld1.8 {d0}, [r3], r6 ;load src - vld1.8 {d1}, [r7], r2 ;load pred - vld1.8 {d2}, [r3], r6 - vld1.8 {d3}, [r7], r2 - vld1.8 {d4}, [r3], r6 - vld1.8 {d5}, [r7], r2 - vld1.8 {d6}, [r3], r6 - vld1.8 {d7}, [r7], r2 - - vsubl.u8 q10, d0, d1 - vsubl.u8 q11, d2, d3 - vsubl.u8 q12, d4, d5 - vsubl.u8 q13, d6, d7 - - mov r2, r2, lsl #1 - - vst1.16 {d20}, [r5], r2 ;store diff - vst1.16 {d22}, [r5], r2 - vst1.16 {d24}, [r5], r2 - vst1.16 {d26}, [r5], r2 - - ldmfd sp!, {r4-r7} - bx lr - - ENDP - - -;========================================== -;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride -; unsigned char *pred, int pred_stride) -|vp8_subtract_mby_neon| PROC - push {r4-r7} - vpush {d8-d15} - - mov r12, #4 - ldr r4, [sp, #80] ; pred_stride - mov r6, #32 ; "diff" stride x2 - add r5, r0, #16 ; second diff pointer - -subtract_mby_loop - vld1.8 {q0}, [r1], r2 ;load src - vld1.8 {q1}, [r3], r4 ;load pred - vld1.8 {q2}, [r1], r2 - vld1.8 {q3}, [r3], r4 - vld1.8 {q4}, [r1], r2 - vld1.8 {q5}, [r3], r4 - vld1.8 {q6}, [r1], r2 - vld1.8 {q7}, [r3], r4 - - vsubl.u8 q8, d0, d2 - vsubl.u8 q9, d1, d3 - vsubl.u8 q10, d4, d6 - vsubl.u8 q11, d5, d7 - vsubl.u8 q12, d8, d10 - vsubl.u8 q13, d9, d11 - vsubl.u8 q14, d12, d14 - vsubl.u8 q15, d13, d15 - - vst1.16 {q8}, [r0], r6 ;store diff - vst1.16 {q9}, [r5], r6 - vst1.16 {q10}, [r0], r6 - vst1.16 {q11}, [r5], r6 - vst1.16 {q12}, [r0], r6 - vst1.16 {q13}, [r5], r6 - vst1.16 {q14}, [r0], r6 - vst1.16 {q15}, [r5], r6 - - subs r12, r12, #1 - bne subtract_mby_loop - - vpop {d8-d15} - pop {r4-r7} - bx lr - ENDP - -;================================= -;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, -; int src_stride, unsigned char *upred, -; unsigned char *vpred, int pred_stride) - -|vp8_subtract_mbuv_neon| PROC - push {r4-r7} - vpush {d8-d15} - - ldr r4, [sp, #80] ; upred - ldr r5, [sp, #84] ; vpred - ldr r6, [sp, #88] ; pred_stride - add r0, r0, #512 ; short *udiff = diff + 256; - mov r12, #32 ; "diff" stride x2 - add r7, r0, #16 ; second diff pointer - -;u - vld1.8 {d0}, [r1], r3 ;load usrc - vld1.8 {d1}, [r4], r6 ;load upred - vld1.8 {d2}, [r1], r3 - vld1.8 {d3}, [r4], r6 - vld1.8 {d4}, [r1], r3 - vld1.8 {d5}, [r4], r6 - vld1.8 {d6}, [r1], r3 - vld1.8 {d7}, [r4], r6 - vld1.8 {d8}, [r1], r3 - vld1.8 {d9}, [r4], r6 - vld1.8 {d10}, [r1], r3 - vld1.8 {d11}, [r4], r6 - vld1.8 {d12}, [r1], r3 - vld1.8 {d13}, [r4], r6 - vld1.8 {d14}, [r1], r3 - vld1.8 {d15}, [r4], r6 - - vsubl.u8 q8, d0, d1 - vsubl.u8 q9, d2, d3 - vsubl.u8 q10, d4, d5 - vsubl.u8 q11, d6, d7 - vsubl.u8 q12, d8, d9 - vsubl.u8 q13, d10, d11 - vsubl.u8 q14, d12, d13 - vsubl.u8 q15, d14, d15 - - vst1.16 {q8}, [r0], r12 ;store diff - vst1.16 {q9}, [r7], r12 - vst1.16 {q10}, [r0], r12 - vst1.16 {q11}, [r7], r12 - vst1.16 {q12}, [r0], r12 - vst1.16 {q13}, [r7], r12 - vst1.16 {q14}, [r0], r12 - vst1.16 {q15}, [r7], r12 - -;v - vld1.8 {d0}, [r2], r3 ;load vsrc - vld1.8 {d1}, [r5], r6 ;load vpred - vld1.8 {d2}, [r2], r3 - vld1.8 {d3}, [r5], r6 - vld1.8 {d4}, [r2], r3 - vld1.8 {d5}, [r5], r6 - vld1.8 {d6}, [r2], r3 - vld1.8 {d7}, [r5], r6 - vld1.8 {d8}, [r2], r3 - vld1.8 {d9}, [r5], r6 - vld1.8 {d10}, [r2], r3 - vld1.8 {d11}, [r5], r6 - vld1.8 {d12}, [r2], r3 - vld1.8 {d13}, [r5], r6 - vld1.8 {d14}, [r2], r3 - vld1.8 {d15}, [r5], r6 - - vsubl.u8 q8, d0, d1 - vsubl.u8 q9, d2, d3 - vsubl.u8 q10, d4, d5 - vsubl.u8 q11, d6, d7 - vsubl.u8 q12, d8, d9 - vsubl.u8 q13, d10, d11 - vsubl.u8 q14, d12, d13 - vsubl.u8 q15, d14, d15 - - vst1.16 {q8}, [r0], r12 ;store diff - vst1.16 {q9}, [r7], r12 - vst1.16 {q10}, [r0], r12 - vst1.16 {q11}, [r7], r12 - vst1.16 {q12}, [r0], r12 - vst1.16 {q13}, [r7], r12 - vst1.16 {q14}, [r0], r12 - vst1.16 {q15}, [r7], r12 - - vpop {d8-d15} - pop {r4-r7} - bx lr - - ENDP - - END diff --git a/vp8/encoder/arm/neon/subtract_neon.c b/vp8/encoder/arm/neon/subtract_neon.c new file mode 100644 index 000000000..d3ab7b165 --- /dev/null +++ b/vp8/encoder/arm/neon/subtract_neon.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "vp8/encoder/block.h" + +void vp8_subtract_b_neon( + BLOCK *be, + BLOCKD *bd, + int pitch) { + unsigned char *src_ptr, *predictor; + int src_stride; + int16_t *src_diff; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint16x8_t q10u16, q11u16, q12u16, q13u16; + + src_ptr = *be->base_src + be->src; + src_stride = be->src_stride; + predictor = bd->predictor; + + d0u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d4u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d6u8 = vld1_u8(src_ptr); + + d1u8 = vld1_u8(predictor); + predictor += pitch; + d3u8 = vld1_u8(predictor); + predictor += pitch; + d5u8 = vld1_u8(predictor); + predictor += pitch; + d7u8 = vld1_u8(predictor); + + q10u16 = vsubl_u8(d0u8, d1u8); + q11u16 = vsubl_u8(d2u8, d3u8); + q12u16 = vsubl_u8(d4u8, d5u8); + q13u16 = vsubl_u8(d6u8, d7u8); + + src_diff = be->src_diff; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16)); + src_diff += pitch; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16)); + src_diff += pitch; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16)); + src_diff += pitch; + vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16)); + return; +} + +void vp8_subtract_mby_neon( + int16_t *diff, + unsigned char *src, + int src_stride, + unsigned char *pred, + int pred_stride) { + int i; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + for (i = 0; i < 8; i++) { // subtract_mby_loop + q0u8 = vld1q_u8(src); + src += src_stride; + q2u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(pred); + pred += pred_stride; + q3u8 = vld1q_u8(pred); + pred += pred_stride; + + q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8)); + q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8)); + + vst1q_u16((uint16_t *)diff, q8u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q9u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q10u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q11u16); + diff += 8; + } + return; +} + +void vp8_subtract_mbuv_neon( + int16_t *diff, + unsigned char *usrc, + unsigned char *vsrc, + int src_stride, + unsigned char *upred, + unsigned char *vpred, + int pred_stride) { + int i, j; + unsigned char *src_ptr, *pred_ptr; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + diff += 256; + for (i = 0; i < 2; i++) { + if (i == 0) { + src_ptr = usrc; + pred_ptr = upred; + } else if (i == 1) { + src_ptr = vsrc; + pred_ptr = vpred; + } + + for (j = 0; j < 2; j++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d1u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d3u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + d4u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d5u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + d6u8 = vld1_u8(src_ptr); + src_ptr += src_stride; + d7u8 = vld1_u8(pred_ptr); + pred_ptr += pred_stride; + + q8u16 = vsubl_u8(d0u8, d1u8); + q9u16 = vsubl_u8(d2u8, d3u8); + q10u16 = vsubl_u8(d4u8, d5u8); + q11u16 = vsubl_u8(d6u8, d7u8); + + vst1q_u16((uint16_t *)diff, q8u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q9u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q10u16); + diff += 8; + vst1q_u16((uint16_t *)diff, q11u16); + diff += 8; + } + } + return; +} diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index 0d01091c8..0b3eed06c 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -38,9 +38,9 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/picklpf_arm.c VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM) -VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/subtract_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c