From 3ae25974fd4abfed344216a28d5af92fb62e3cc6 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 18 Oct 2016 12:30:43 -0700 Subject: [PATCH] idct,NEON: add a tran_low_t->s16 load adapter enable idct4x4* and idct8x8* which are compatible for 8-bit decodes in high-bitdepth mode. the adapter narrows 32-bit input to 16, whether the expansion can be avoided at all in this case remains a TODO. roughly matches sse2. BUG=webm:1294 Change-Id: I3ea94e5a2070dfd509b5de0c555aab4e1f4da036 --- vpx_dsp/arm/idct4x4_add_neon.asm | 4 +++- vpx_dsp/arm/idct4x4_add_neon.c | 5 +++-- vpx_dsp/arm/idct8x8_add_neon.asm | 18 +++++++++-------- vpx_dsp/arm/idct8x8_add_neon.c | 33 ++++++++++++++++--------------- vpx_dsp/arm/idct_neon.asm | 29 +++++++++++++++++++++++++++ vpx_dsp/arm/idct_neon.h | 34 ++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp.mk | 10 ++++++---- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +++--- 8 files changed, 105 insertions(+), 34 deletions(-) create mode 100644 vpx_dsp/arm/idct_neon.asm create mode 100644 vpx_dsp/arm/idct_neon.h diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm index a4ccba993..c7c60c7ca 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/vpx_dsp/arm/idct4x4_add_neon.asm @@ -15,6 +15,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.s + AREA Block, CODE, READONLY ; name this block of code ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; @@ -33,7 +35,7 @@ ; So, two passes of a transpose followed by a column transform. ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 ; generate scalar constants ; cospi_8_64 = 15137 diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 24b91fe48..8f669c907 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -11,6 +11,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, @@ -28,8 +29,8 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d26u32 = d27u32 = vdup_n_u32(0); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); diff --git a/vpx_dsp/arm/idct8x8_add_neon.asm b/vpx_dsp/arm/idct8x8_add_neon.asm index 21e75951e..8d6957403 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.asm +++ b/vpx_dsp/arm/idct8x8_add_neon.asm @@ -16,6 +16,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.s + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are ; loaded in q8-q15. The output will be stored back into q8-q15 registers. ; This macro will touch q0-q7 registers and use them as buffer during @@ -207,10 +209,10 @@ |vpx_idct8x8_64_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 @@ -312,10 +314,10 @@ |vpx_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index d73feebec..159a6ec98 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -173,14 +174,14 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -279,14 +280,14 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm new file mode 100644 index 000000000..a223c0b63 --- /dev/null +++ b/vpx_dsp/arm/idct_neon.asm @@ -0,0 +1,29 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + INCLUDE ./vpx_config.asm + + ; Helper function used to load tran_low_t into int16, narrowing if + ; necessary. + ; $dst0..3 are d registers with the pairs assumed to be contiguous in + ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld1.s32 {q0,q1}, [$src]! + vld1.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q1 + vmovn.i32 $dst2, q2 + vmovn.i32 $dst3, q3 + ELSE + vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! + ENDIF + MEND diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h new file mode 100644 index 000000000..00be7ede8 --- /dev/null +++ b/vpx_dsp/arm/idct_neon.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_DSP_ARM_IDCT_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" + +//------------------------------------------------------------------------------ + +// Helper function used to load tran_low_t into int16, narrowing if necessary. +static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +#endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f008f5ce9..8c91b141f 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -204,13 +204,9 @@ DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) else ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct4x4_add_neon.c -DSP_SRCS-yes += arm/idct8x8_add_neon.c DSP_SRCS-yes += arm/idct16x16_add_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM @@ -233,14 +229,20 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/idct_neon$(ASM) DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) else DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c endif # CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index c40e75380..7f31a6a11 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -644,16 +644,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; } else { add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct4x4_16_add sse2/; + specialize qw/vpx_idct4x4_16_add neon sse2/; add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct4x4_1_add neon sse2/; add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct8x8_1_add neon sse2/;