NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon()

Also expose the NEON intrinsics version.

BUG=webm:1261, webm:1266.

Change-Id: I8c4ae658467dcf66ebf7a75982b2ef712dbb4535
This commit is contained in:
Linfeng Zhang 2016-08-12 18:14:21 -07:00
parent 5d881770e5
commit f9efbad392
5 changed files with 94 additions and 36 deletions

View File

@ -531,16 +531,12 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
::testing::Values(
// Using #if inside the macro is unsupported on MSVS but the tests are not
// currently built for MSVS with ARM and NEON.
#if HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_8_dual_neon,
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test9Param,
::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon,
&vpx_lpf_horizontal_8_dual_c, 8),
make_tuple(&vpx_lpf_vertical_8_dual_neon,
&vpx_lpf_vertical_8_dual_c, 8),
#endif // HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_4_dual_neon,
&vpx_lpf_horizontal_4_dual_c, 8),
make_tuple(&vpx_lpf_vertical_4_dual_neon,

View File

@ -9,7 +9,9 @@
;
EXPORT |vpx_lpf_horizontal_8_neon|
EXPORT |vpx_lpf_horizontal_8_dual_neon|
EXPORT |vpx_lpf_vertical_8_neon|
EXPORT |vpx_lpf_vertical_8_dual_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
@ -64,6 +66,38 @@
ENDP ; |vpx_lpf_horizontal_8_neon|
; void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
;                                     int p,
;                                     const uint8_t *blimit0,
;                                     const uint8_t *limit0,
;                                     const uint8_t *thresh0,
;                                     const uint8_t *blimit1,
;                                     const uint8_t *limit1,
;                                     const uint8_t *thresh1)
;
; Runs vpx_lpf_horizontal_8_neon twice: first at s with
; (blimit0, limit0, thresh0), then at s + 8 with (blimit1, limit1, thresh1).
; The second run is a tail call, so it returns straight to our caller.
;
; Arguments on entry:
; r0       uint8_t *s,
; r1       int p, /* pitch */
; r2       const uint8_t *blimit0,
; r3       const uint8_t *limit0,
; sp       const uint8_t *thresh0,
; sp + 4   const uint8_t *blimit1,
; sp + 8   const uint8_t *limit1,
; sp + 12  const uint8_t *thresh1,
|vpx_lpf_horizontal_8_dual_neon| PROC
; Keep s, pitch and the return address so they can be restored after the
; first call. sp is now 12 bytes below its entry value.
push {r0-r1, lr}
; Fetch thresh0 from the caller's first stack slot (entry sp == sp + 12).
ldr lr, [sp, #12]
; Place thresh0 at [sp] as the stack argument of the first call.
; sp is now 16 bytes below its entry value.
push {lr} ; thresh0
; First pass: (s, p, blimit0, limit0, thresh0).
bl vpx_lpf_horizontal_8_neon
; Load the second parameter set. With sp 16 bytes below entry,
; offsets 20/24/28 address entry sp + 4/+8/+12.
ldr r2, [sp, #20] ; blimit1
ldr r3, [sp, #24] ; limit1
ldr lr, [sp, #28] ; thresh1
; Overwrite the incoming thresh0 slot (entry sp + 0) with thresh1, so that
; once sp is restored [sp] holds the stack argument for the second pass.
str lr, [sp, #16] ; thresh1
; Discard the temporary thresh0 copy pushed above.
add sp, #4
; Restore s, pitch and the original return address; sp is back at its
; entry value.
pop {r0-r1, lr}
add r0, #8 ; s + 8
; Second pass as a tail call: (s + 8, p, blimit1, limit1, thresh1).
b vpx_lpf_horizontal_8_neon
ENDP ; |vpx_lpf_horizontal_8_dual_neon|
; void vpx_lpf_vertical_8_neon(uint8_t *s,
; int pitch,
; const uint8_t *blimit,
@ -139,6 +173,38 @@
pop {r4-r5, pc}
ENDP ; |vpx_lpf_vertical_8_neon|
; void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
;                                   int pitch,
;                                   const uint8_t *blimit0,
;                                   const uint8_t *limit0,
;                                   const uint8_t *thresh0,
;                                   const uint8_t *blimit1,
;                                   const uint8_t *limit1,
;                                   const uint8_t *thresh1)
;
; Runs vpx_lpf_vertical_8_neon twice: first at s with
; (blimit0, limit0, thresh0), then at s + 8 * pitch with
; (blimit1, limit1, thresh1). The second run is a tail call, so it returns
; straight to our caller.
;
; Arguments on entry:
; r0       uint8_t *s,
; r1       int pitch,
; r2       const uint8_t *blimit0,
; r3       const uint8_t *limit0,
; sp       const uint8_t *thresh0,
; sp + 4   const uint8_t *blimit1,
; sp + 8   const uint8_t *limit1,
; sp + 12  const uint8_t *thresh1,
|vpx_lpf_vertical_8_dual_neon| PROC
; Keep s, pitch and the return address so they can be restored after the
; first call. sp is now 12 bytes below its entry value.
push {r0-r1, lr}
; Fetch thresh0 from the caller's first stack slot (entry sp == sp + 12).
ldr lr, [sp, #12]
; Place thresh0 at [sp] as the stack argument of the first call.
; sp is now 16 bytes below its entry value.
push {lr} ; thresh0
; First pass: (s, pitch, blimit0, limit0, thresh0).
bl vpx_lpf_vertical_8_neon
; Load the second parameter set. With sp 16 bytes below entry,
; offsets 20/24/28 address entry sp + 4/+8/+12.
ldr r2, [sp, #20] ; blimit1
ldr r3, [sp, #24] ; limit1
ldr lr, [sp, #28] ; thresh1
; Overwrite the incoming thresh0 slot (entry sp + 0) with thresh1, so that
; once sp is restored [sp] holds the stack argument for the second pass.
str lr, [sp, #16] ; thresh1
; Discard the temporary thresh0 copy pushed above.
add sp, #4
; Restore s, pitch and the original return address; sp is back at its
; entry value.
pop {r0-r1, lr}
; Advance s by eight rows: r0 = r0 + (r1 << 3).
add r0, r1, lsl #3 ; s + 8 * pitch
; Second pass as a tail call: (s + 8 * pitch, pitch, blimit1, limit1, thresh1).
b vpx_lpf_vertical_8_neon
ENDP ; |vpx_lpf_vertical_8_dual_neon|
; void vpx_mbloop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store. The function does not use

View File

@ -311,6 +311,14 @@ void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
return;
}
/* Dual horizontal 8 loop filter.
 *
 * Invokes vpx_lpf_horizontal_8_neon() twice with the same pitch: once at s
 * using (blimit0, limit0, thresh0) and once at s + 8 using
 * (blimit1, limit1, thresh1).
 */
void vpx_lpf_horizontal_8_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  /* Start address handed to the second filter call. */
  uint8_t *const second_start = s + 8;

  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_neon(second_start, p, blimit1, limit1, thresh1);
}
void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
@ -427,3 +435,11 @@ void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
}
return;
}
/* Dual vertical 8 loop filter.
 *
 * Invokes vpx_lpf_vertical_8_neon() twice with the same pitch: once at s
 * using (blimit0, limit0, thresh0) and once at s + 8 * p using
 * (blimit1, limit1, thresh1).
 */
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  /* Start address handed to the second filter call: eight pitches past s. */
  uint8_t *const second_start = s + 8 * p;

  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_neon(second_start, p, blimit1, limit1, thresh1);
}

View File

@ -21,21 +21,3 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
#if HAVE_NEON_ASM
/* Dual horizontal 8 loop filter built on the single-block routine.
 *
 * Calls vpx_lpf_horizontal_8_neon() at s with (blimit0, limit0, thresh0),
 * then at s + 8 with (blimit1, limit1, thresh1); the pitch p is passed
 * unchanged to both calls.
 */
void vpx_lpf_horizontal_8_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  /* Start address handed to the second filter call. */
  uint8_t *const second_start = s + 8;

  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_neon(second_start, p, blimit1, limit1, thresh1);
}
/* Dual vertical 8 loop filter built on the single-block routine.
 *
 * Calls vpx_lpf_vertical_8_neon() at s with (blimit0, limit0, thresh0),
 * then at s + 8 * p with (blimit1, limit1, thresh1); the pitch p is passed
 * unchanged to both calls.
 */
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  /* Start address handed to the second filter call: eight pitches past s. */
  uint8_t *const second_start = s + 8 * p;

  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_neon(second_start, p, blimit1, limit1, thresh1);
}
#endif // HAVE_NEON_ASM

View File

@ -514,8 +514,7 @@ add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *bl
specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
@ -533,8 +532,7 @@ add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *
specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;