Merge "VP8 encoder for ARMv8 by using NEON intrinsics 3"
This commit is contained in:
commit
a19035e8bb
@ -105,7 +105,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
|
||||
INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
|
||||
::testing::Values(vp8_subtract_b_c));
|
||||
|
||||
#if HAVE_NEON_ASM
|
||||
#if HAVE_NEON
|
||||
INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
|
||||
::testing::Values(vp8_subtract_b_neon));
|
||||
#endif
|
||||
|
@ -502,19 +502,16 @@ specialize qw/vp8_mbuverror mmx sse2/;
|
||||
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
|
||||
|
||||
add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
|
||||
specialize qw/vp8_subtract_b mmx sse2 media neon_asm/;
|
||||
specialize qw/vp8_subtract_b mmx sse2 media neon/;
|
||||
$vp8_subtract_b_media=vp8_subtract_b_armv6;
|
||||
$vp8_subtract_b_neon_asm=vp8_subtract_b_neon;
|
||||
|
||||
add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
|
||||
specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/;
|
||||
specialize qw/vp8_subtract_mby mmx sse2 media neon/;
|
||||
$vp8_subtract_mby_media=vp8_subtract_mby_armv6;
|
||||
$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon;
|
||||
|
||||
add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
|
||||
specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/;
|
||||
specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
|
||||
$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
|
||||
$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon;
|
||||
|
||||
#
|
||||
# Motion search
|
||||
|
@ -1,205 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_subtract_b_neon|
|
||||
EXPORT |vp8_subtract_mby_neon|
|
||||
EXPORT |vp8_subtract_mbuv_neon|
|
||||
|
||||
INCLUDE vp8_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
|
||||
|vp8_subtract_b_neon| PROC
|
||||
|
||||
stmfd sp!, {r4-r7}
|
||||
|
||||
ldr r3, [r0, #vp8_block_base_src]
|
||||
ldr r4, [r0, #vp8_block_src]
|
||||
ldr r5, [r0, #vp8_block_src_diff]
|
||||
ldr r3, [r3]
|
||||
ldr r6, [r0, #vp8_block_src_stride]
|
||||
add r3, r3, r4 ; src = *base_src + src
|
||||
ldr r7, [r1, #vp8_blockd_predictor]
|
||||
|
||||
vld1.8 {d0}, [r3], r6 ;load src
|
||||
vld1.8 {d1}, [r7], r2 ;load pred
|
||||
vld1.8 {d2}, [r3], r6
|
||||
vld1.8 {d3}, [r7], r2
|
||||
vld1.8 {d4}, [r3], r6
|
||||
vld1.8 {d5}, [r7], r2
|
||||
vld1.8 {d6}, [r3], r6
|
||||
vld1.8 {d7}, [r7], r2
|
||||
|
||||
vsubl.u8 q10, d0, d1
|
||||
vsubl.u8 q11, d2, d3
|
||||
vsubl.u8 q12, d4, d5
|
||||
vsubl.u8 q13, d6, d7
|
||||
|
||||
mov r2, r2, lsl #1
|
||||
|
||||
vst1.16 {d20}, [r5], r2 ;store diff
|
||||
vst1.16 {d22}, [r5], r2
|
||||
vst1.16 {d24}, [r5], r2
|
||||
vst1.16 {d26}, [r5], r2
|
||||
|
||||
ldmfd sp!, {r4-r7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
;==========================================
|
||||
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
|
||||
; unsigned char *pred, int pred_stride)
|
||||
|vp8_subtract_mby_neon| PROC
|
||||
push {r4-r7}
|
||||
vpush {d8-d15}
|
||||
|
||||
mov r12, #4
|
||||
ldr r4, [sp, #80] ; pred_stride
|
||||
mov r6, #32 ; "diff" stride x2
|
||||
add r5, r0, #16 ; second diff pointer
|
||||
|
||||
subtract_mby_loop
|
||||
vld1.8 {q0}, [r1], r2 ;load src
|
||||
vld1.8 {q1}, [r3], r4 ;load pred
|
||||
vld1.8 {q2}, [r1], r2
|
||||
vld1.8 {q3}, [r3], r4
|
||||
vld1.8 {q4}, [r1], r2
|
||||
vld1.8 {q5}, [r3], r4
|
||||
vld1.8 {q6}, [r1], r2
|
||||
vld1.8 {q7}, [r3], r4
|
||||
|
||||
vsubl.u8 q8, d0, d2
|
||||
vsubl.u8 q9, d1, d3
|
||||
vsubl.u8 q10, d4, d6
|
||||
vsubl.u8 q11, d5, d7
|
||||
vsubl.u8 q12, d8, d10
|
||||
vsubl.u8 q13, d9, d11
|
||||
vsubl.u8 q14, d12, d14
|
||||
vsubl.u8 q15, d13, d15
|
||||
|
||||
vst1.16 {q8}, [r0], r6 ;store diff
|
||||
vst1.16 {q9}, [r5], r6
|
||||
vst1.16 {q10}, [r0], r6
|
||||
vst1.16 {q11}, [r5], r6
|
||||
vst1.16 {q12}, [r0], r6
|
||||
vst1.16 {q13}, [r5], r6
|
||||
vst1.16 {q14}, [r0], r6
|
||||
vst1.16 {q15}, [r5], r6
|
||||
|
||||
subs r12, r12, #1
|
||||
bne subtract_mby_loop
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
;=================================
|
||||
;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
; int src_stride, unsigned char *upred,
|
||||
; unsigned char *vpred, int pred_stride)
|
||||
|
||||
|vp8_subtract_mbuv_neon| PROC
|
||||
push {r4-r7}
|
||||
vpush {d8-d15}
|
||||
|
||||
ldr r4, [sp, #80] ; upred
|
||||
ldr r5, [sp, #84] ; vpred
|
||||
ldr r6, [sp, #88] ; pred_stride
|
||||
add r0, r0, #512 ; short *udiff = diff + 256;
|
||||
mov r12, #32 ; "diff" stride x2
|
||||
add r7, r0, #16 ; second diff pointer
|
||||
|
||||
;u
|
||||
vld1.8 {d0}, [r1], r3 ;load usrc
|
||||
vld1.8 {d1}, [r4], r6 ;load upred
|
||||
vld1.8 {d2}, [r1], r3
|
||||
vld1.8 {d3}, [r4], r6
|
||||
vld1.8 {d4}, [r1], r3
|
||||
vld1.8 {d5}, [r4], r6
|
||||
vld1.8 {d6}, [r1], r3
|
||||
vld1.8 {d7}, [r4], r6
|
||||
vld1.8 {d8}, [r1], r3
|
||||
vld1.8 {d9}, [r4], r6
|
||||
vld1.8 {d10}, [r1], r3
|
||||
vld1.8 {d11}, [r4], r6
|
||||
vld1.8 {d12}, [r1], r3
|
||||
vld1.8 {d13}, [r4], r6
|
||||
vld1.8 {d14}, [r1], r3
|
||||
vld1.8 {d15}, [r4], r6
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
vsubl.u8 q10, d4, d5
|
||||
vsubl.u8 q11, d6, d7
|
||||
vsubl.u8 q12, d8, d9
|
||||
vsubl.u8 q13, d10, d11
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0], r12 ;store diff
|
||||
vst1.16 {q9}, [r7], r12
|
||||
vst1.16 {q10}, [r0], r12
|
||||
vst1.16 {q11}, [r7], r12
|
||||
vst1.16 {q12}, [r0], r12
|
||||
vst1.16 {q13}, [r7], r12
|
||||
vst1.16 {q14}, [r0], r12
|
||||
vst1.16 {q15}, [r7], r12
|
||||
|
||||
;v
|
||||
vld1.8 {d0}, [r2], r3 ;load vsrc
|
||||
vld1.8 {d1}, [r5], r6 ;load vpred
|
||||
vld1.8 {d2}, [r2], r3
|
||||
vld1.8 {d3}, [r5], r6
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d5}, [r5], r6
|
||||
vld1.8 {d6}, [r2], r3
|
||||
vld1.8 {d7}, [r5], r6
|
||||
vld1.8 {d8}, [r2], r3
|
||||
vld1.8 {d9}, [r5], r6
|
||||
vld1.8 {d10}, [r2], r3
|
||||
vld1.8 {d11}, [r5], r6
|
||||
vld1.8 {d12}, [r2], r3
|
||||
vld1.8 {d13}, [r5], r6
|
||||
vld1.8 {d14}, [r2], r3
|
||||
vld1.8 {d15}, [r5], r6
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
vsubl.u8 q10, d4, d5
|
||||
vsubl.u8 q11, d6, d7
|
||||
vsubl.u8 q12, d8, d9
|
||||
vsubl.u8 q13, d10, d11
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0], r12 ;store diff
|
||||
vst1.16 {q9}, [r7], r12
|
||||
vst1.16 {q10}, [r0], r12
|
||||
vst1.16 {q11}, [r7], r12
|
||||
vst1.16 {q12}, [r0], r12
|
||||
vst1.16 {q13}, [r7], r12
|
||||
vst1.16 {q14}, [r0], r12
|
||||
vst1.16 {q15}, [r7], r12
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
154
vp8/encoder/arm/neon/subtract_neon.c
Normal file
154
vp8/encoder/arm/neon/subtract_neon.c
Normal file
@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "vp8/encoder/block.h"
|
||||
|
||||
void vp8_subtract_b_neon(
|
||||
BLOCK *be,
|
||||
BLOCKD *bd,
|
||||
int pitch) {
|
||||
unsigned char *src_ptr, *predictor;
|
||||
int src_stride;
|
||||
int16_t *src_diff;
|
||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||
uint16x8_t q10u16, q11u16, q12u16, q13u16;
|
||||
|
||||
src_ptr = *be->base_src + be->src;
|
||||
src_stride = be->src_stride;
|
||||
predictor = bd->predictor;
|
||||
|
||||
d0u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d2u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d4u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d6u8 = vld1_u8(src_ptr);
|
||||
|
||||
d1u8 = vld1_u8(predictor);
|
||||
predictor += pitch;
|
||||
d3u8 = vld1_u8(predictor);
|
||||
predictor += pitch;
|
||||
d5u8 = vld1_u8(predictor);
|
||||
predictor += pitch;
|
||||
d7u8 = vld1_u8(predictor);
|
||||
|
||||
q10u16 = vsubl_u8(d0u8, d1u8);
|
||||
q11u16 = vsubl_u8(d2u8, d3u8);
|
||||
q12u16 = vsubl_u8(d4u8, d5u8);
|
||||
q13u16 = vsubl_u8(d6u8, d7u8);
|
||||
|
||||
src_diff = be->src_diff;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
|
||||
src_diff += pitch;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
|
||||
src_diff += pitch;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
|
||||
src_diff += pitch;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_subtract_mby_neon(
|
||||
int16_t *diff,
|
||||
unsigned char *src,
|
||||
int src_stride,
|
||||
unsigned char *pred,
|
||||
int pred_stride) {
|
||||
int i;
|
||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||
uint16x8_t q8u16, q9u16, q10u16, q11u16;
|
||||
|
||||
for (i = 0; i < 8; i++) { // subtract_mby_loop
|
||||
q0u8 = vld1q_u8(src);
|
||||
src += src_stride;
|
||||
q2u8 = vld1q_u8(src);
|
||||
src += src_stride;
|
||||
q1u8 = vld1q_u8(pred);
|
||||
pred += pred_stride;
|
||||
q3u8 = vld1q_u8(pred);
|
||||
pred += pred_stride;
|
||||
|
||||
q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
|
||||
q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
|
||||
q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
|
||||
q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
|
||||
|
||||
vst1q_u16((uint16_t *)diff, q8u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q9u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q10u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q11u16);
|
||||
diff += 8;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_subtract_mbuv_neon(
|
||||
int16_t *diff,
|
||||
unsigned char *usrc,
|
||||
unsigned char *vsrc,
|
||||
int src_stride,
|
||||
unsigned char *upred,
|
||||
unsigned char *vpred,
|
||||
int pred_stride) {
|
||||
int i, j;
|
||||
unsigned char *src_ptr, *pred_ptr;
|
||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||
uint16x8_t q8u16, q9u16, q10u16, q11u16;
|
||||
|
||||
diff += 256;
|
||||
for (i = 0; i < 2; i++) {
|
||||
if (i == 0) {
|
||||
src_ptr = usrc;
|
||||
pred_ptr = upred;
|
||||
} else if (i == 1) {
|
||||
src_ptr = vsrc;
|
||||
pred_ptr = vpred;
|
||||
}
|
||||
|
||||
for (j = 0; j < 2; j++) {
|
||||
d0u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d1u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
d2u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d3u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
d4u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d5u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
d6u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d7u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
|
||||
q8u16 = vsubl_u8(d0u8, d1u8);
|
||||
q9u16 = vsubl_u8(d2u8, d3u8);
|
||||
q10u16 = vsubl_u8(d4u8, d5u8);
|
||||
q11u16 = vsubl_u8(d6u8, d7u8);
|
||||
|
||||
vst1q_u16((uint16_t *)diff, q8u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q9u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q10u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q11u16);
|
||||
diff += 8;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
@ -38,9 +38,9 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
|
||||
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
|
||||
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/picklpf_arm.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM)
|
||||
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/subtract_neon$(ASM)
|
||||
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
|
||||
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
|
||||
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
|
||||
|
Loading…
x
Reference in New Issue
Block a user