From 8befcd008924a111c08dc58fa740e905bc0b0d5f Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 23 Nov 2016 20:48:00 -0800 Subject: [PATCH] enable vpx_idct16x16_10_add_neon in hbd builds BUG=webm:1294 Change-Id: Ibad079f25e673d4f5181961896a8a8333a51e825 --- vpx_dsp/arm/idct16x16_add_neon.asm | 43 ++++++++++++++++-------------- vpx_dsp/arm/idct16x16_add_neon.c | 39 +++++++++++++-------------- vpx_dsp/arm/idct16x16_neon.c | 6 ++--- vpx_dsp/arm/idct_neon.asm | 18 ++++++++++++- vpx_dsp/arm/idct_neon.h | 17 ++++++++++++ vpx_dsp/vpx_dsp.mk | 11 +++----- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 7 files changed, 83 insertions(+), 53 deletions(-) diff --git a/vpx_dsp/arm/idct16x16_add_neon.asm b/vpx_dsp/arm/idct16x16_add_neon.asm index e7a5e1012..05fce054c 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.asm +++ b/vpx_dsp/arm/idct16x16_add_neon.asm @@ -18,6 +18,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. MACRO TRANSPOSE8X8 @@ -753,9 +755,10 @@ end_idct16x16_pass2 bx lr ENDP ; |vpx_idct16x16_256_add_neon_pass2| -;void |vpx_idct16x16_10_add_neon_pass1|(const int16_t *input, int16_t *output) +;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input, +; int16_t *output) ; -; r0 const int16_t *input +; r0 const tran_low_t *input ; r1 int16_t *output ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output @@ -765,14 +768,14 @@ end_idct16x16_pass2 ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 vmov.s16 q15, q1 ; cospi_28_64*2 = 6392 @@ -857,10 +860,10 @@ end_idct16x16_pass2 bx lr ENDP ; |vpx_idct16x16_10_add_neon_pass1| -;void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, +;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, ; int16_t *pass1_output) ; -; r0 const int16_t *src +; r0 const tran_low_t *src ; r1 int16_t *output ; r2 int16_t *pass1_output @@ -872,14 +875,14 @@ end_idct16x16_pass2 ; TODO(hkuang): Find a better way to load the elements. ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 vmov.s16 q15, q0; ; 2*cospi_30_64 = 3212 diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c index 3e11159e2..d101bba41 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.c +++ b/vpx_dsp/arm/idct16x16_add_neon.c @@ -10,8 +10,7 @@ #include -#include "./vpx_config.h" -#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) { @@ -761,7 +760,7 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out, } } -void vpx_idct16x16_10_add_neon_pass1(const int16_t *in, int16_t *out) { +void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *in, int16_t *out) { int16x4_t d4s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; @@ -770,28 +769,28 @@ void vpx_idct16x16_10_add_neon_pass1(const int16_t *in, int16_t *out) { int32x4_t q10s32, q11s32, q12s32, q15s32; int16x8x2_t q0x2s16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q8s16 = q0x2s16.val[0]; in += 16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q9s16 = q0x2s16.val[0]; in += 16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q10s16 = q0x2s16.val[0]; in += 16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q11s16 = q0x2s16.val[0]; in += 16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q12s16 = q0x2s16.val[0]; in += 16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q13s16 = q0x2s16.val[0]; in += 16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q14s16 = q0x2s16.val[0]; in += 16; - q0x2s16 = vld2q_s16(in); + q0x2s16 = load_tran_low_to_s16x2q(in); q15s16 = q0x2s16.val[0]; transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, @@ -859,7 +858,7 @@ void vpx_idct16x16_10_add_neon_pass1(const int16_t *in, int16_t *out) { vst1q_s16(out, q15s16); } -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *out, +void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *out, int16_t *pass1_output) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; @@ -874,28 +873,28 @@ void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *out, int32x4_t q10s32, q11s32, q12s32, q13s32; int16x8x2_t q0x2s16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q8s16 = q0x2s16.val[0]; src += 16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q9s16 = q0x2s16.val[0]; src += 16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q10s16 = q0x2s16.val[0]; src += 16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q11s16 = q0x2s16.val[0]; src += 16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q12s16 = q0x2s16.val[0]; src += 16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q13s16 = q0x2s16.val[0]; src += 16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q14s16 = q0x2s16.val[0]; src += 16; - q0x2s16 = vld2q_s16(src); + q0x2s16 = load_tran_low_to_s16x2q(src); q15s16 = q0x2s16.val[0]; transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, diff --git a/vpx_dsp/arm/idct16x16_neon.c b/vpx_dsp/arm/idct16x16_neon.c index 141b8d27e..8eae549bb 100644 --- a/vpx_dsp/arm/idct16x16_neon.c +++ b/vpx_dsp/arm/idct16x16_neon.c @@ -16,8 +16,8 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, int16_t *pass1_output, int16_t skip_adding, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, +void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output); +void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, int16_t *pass1_output); #if HAVE_NEON_ASM @@ -90,7 +90,7 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, #endif } -void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm index f39e8ddd4..5dd9bdc78 100644 --- a/vpx_dsp/arm/idct_neon.asm +++ b/vpx_dsp/arm/idct_neon.asm @@ -10,8 +10,9 @@ INCLUDE ./vpx_config.asm - ; Helper function used to load tran_low_t into int16, narrowing if + ; Helper functions used to load tran_low_t into int16, narrowing if ; necessary. + ; $dst0..3 are d registers with the pairs assumed to be contiguous in ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. MACRO @@ -27,4 +28,19 @@ vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! ENDIF MEND + + ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld2.s32 {q0,q1}, [$src]! + vld2.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q2 + vmovn.i32 $dst2, q1 + vmovn.i32 $dst3, q3 + ELSE + vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]! + ENDIF + MEND END diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index e4493a105..34758a83b 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -20,6 +20,23 @@ //------------------------------------------------------------------------------ // Helper functions used to load tran_low_t into int16, narrowing if necessary. +static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH const int32x4_t v0 = vld1q_s32(buf); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 200ef07f1..a58dcd6b8 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -204,14 +204,6 @@ endif # ARCH_X86_64 DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct16x16_add_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h @@ -235,14 +227,17 @@ DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) else DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d78a35757..d2d797bdf 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -689,7 +689,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct16x16_256_add sse2/; add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_10_add sse2/; + specialize qw/vpx_idct16x16_10_add neon sse2/; add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct16x16_1_add neon sse2/;