From 0d1699b67f5cac045224e6f0fb15593d06230088 Mon Sep 17 00:00:00 2001 From: HFVideoMac Date: Mon, 21 Jul 2014 12:19:43 +0800 Subject: [PATCH] Add Unit Test and ARM64 code for block_add --- .../welsdec/welsdec.xcodeproj/project.pbxproj | 4 + .../core/arm64/block_add_aarch64_neon.S | 161 ++++++++++++++++++ codec/decoder/core/inc/decode_mb_aux.h | 5 + codec/decoder/core/inc/decode_slice.h | 3 + codec/decoder/core/src/decode_slice.cpp | 6 + codec/decoder/core/src/decoder.cpp | 2 +- codec/decoder/core/src/error_concealment.cpp | 7 + codec/decoder/targets.mk | 1 + test/decoder/DecUT_IdctResAddPred.cpp | 53 ++++++ 9 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 codec/decoder/core/arm64/block_add_aarch64_neon.S diff --git a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj index 6b645294..c09e3bb6 100644 --- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj +++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj @@ -30,6 +30,7 @@ 4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; }; 4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; }; 4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; }; + 6C749B6A197CC6E600A111F9 /* block_add_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */; }; 9ABF4382193EB60900A6BD61 /* expand_pic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9ABF4381193EB60900A6BD61 /* expand_pic.cpp */; }; 9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66551946A1DE009A3567 /* welsCodecTrace.cpp */; }; 9AED66591946A203009A3567 /* utils.cpp in Sources */ 
= {isa = PBXBuildFile; fileRef = 9AED66581946A203009A3567 /* utils.cpp */; }; @@ -108,6 +109,7 @@ 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; }; 4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; }; 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; }; + 6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = block_add_aarch64_neon.S; path = arm64/block_add_aarch64_neon.S; sourceTree = "<group>"; }; 9ABF4380193EB5F700A6BD61 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = expand_pic.h; path = ../../../common/inc/expand_pic.h; sourceTree = "<group>"; }; 9ABF4381193EB60900A6BD61 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = expand_pic.cpp; path = ../../../common/src/expand_pic.cpp; sourceTree = "<group>"; }; 9AED66551946A1DE009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; }; @@ -133,6 +135,7 @@ 4CBC1B7F194AC4A400214D9E /* arm64 */ = { isa = PBXGroup; children = ( + 6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */, 4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */, ); name = arm64; @@ -353,6 +356,7 @@ 4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */, 4CE4469918BC5EAB0017DF25 /* mv_pred.cpp in Sources */, 4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */, + 6C749B6A197CC6E600A111F9 /* block_add_aarch64_neon.S in Sources */, 
4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */, 9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */, F0B204FC18FD23D8005DA23F /* error_concealment.cpp in Sources */, diff --git a/codec/decoder/core/arm64/block_add_aarch64_neon.S b/codec/decoder/core/arm64/block_add_aarch64_neon.S new file mode 100644 index 00000000..99ddc38f --- /dev/null +++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S @@ -0,0 +1,161 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifdef HAVE_NEON_AARCH64 +.text +#include "arm_arch64_common_macro.S" +#ifdef __APPLE__ +.macro ROW_TRANSFORM_1_STEP +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 + + saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2]; + ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2]; + sshr $8.4h, $1.4h, #1 + sshr $9.4h, $3.4h, #1 + ssubl $6.4s, $8.4h, $3.4h //int32 e[i][2] = (src[1]>>1)-src[3]; + saddl $7.4s, $1.4h, $9.4h //int32 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_4BYTES // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3]; + add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2]; + sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2]; + sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + +.macro COL_TRANSFORM_1_STEP +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j]; + sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j]; + sshr $6.4s, $1.4s, #1 + sshr $7.4s, $3.4s, #1 + sub $6.4s, $6.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + add $7.4s, $1.4s, $7.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm + +#else +.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\() + + saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2]; + ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2]; + sshr \arg8\().4h, \arg1\().4h, #1 + sshr \arg9\().4h, \arg3\().4h, #1 + ssubl \arg6\().4s, \arg8\().4h, \arg3\().4h //int32 e[i][2] = (src[1]>>1)-src[3]; + saddl \arg7\().4s, \arg1\().4h, \arg9\().4h //int32 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// both row & col transform used +// 
{ // output: f_q[0]~[3], input: e_q[0]~[3]; + add \arg0\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][0] = e[i][0] + e[i][3]; + add \arg1\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][1] = e[i][1] + e[i][2]; + sub \arg2\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][2] = e[i][1] - e[i][2]; + sub \arg3\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + +.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + add \arg4\().4s, \arg0\().4s, \arg2\().4s //int32 e[0][j] = f[0][j] + f[2][j]; + sub \arg5\().4s, \arg0\().4s, \arg2\().4s //int32 e[1][j] = f[0][j] - f[2][j]; + sshr \arg6\().4s, \arg1\().4s, #1 + sshr \arg7\().4s, \arg3\().4s, #1 + sub \arg6\().4s, \arg6\().4s, \arg3\().4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + add \arg7\().4s, \arg1\().4s, \arg7\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm +#endif +// x0 int8_t* non_zero_count, +WELS_ASM_AARCH64_FUNC_BEGIN SetNonZeroCount_AArch64_neon + mov x1, x0 + ld1 {v0.16b}, [x1], #16 + ld1 {v1.8b}, [x1] + cmeq v0.16b, v0.16b, #0 + cmeq v1.8b, v1.8b, #0 + mvn v0.16b, v0.16b + mvn v1.8b, v1.8b + abs v0.16b, v0.16b + abs v1.8b, v1.8b + st1 {v0.16b}, [x0], #16 + st1 {v1.8b}, [x0] +WELS_ASM_AARCH64_FUNC_END + +// x0: uint8_t *pred, w1: const int32_t stride, x2: int16_t *rs +WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon + sxtw x1, w1 // stride is a 32-bit int: sign-extend it before x1 is used as the 64-bit post-index offset below + ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2] // cost 3 cycles! 
+ ROW_TRANSFORM_1_STEP v0, v1, v2, v3, v16, v17, v18, v19, v4, v5 + TRANSFORM_4BYTES v0, v1, v2, v3, v16, v17, v18, v19 + // transform element 32bits + trn1 v16.4s, v0.4s, v1.4s //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6] + trn2 v17.4s, v0.4s, v1.4s //[0 1 2 3]+[4 5 6 7]-->[1 5 3 7] + trn1 v18.4s, v2.4s, v3.4s //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14] + trn2 v19.4s, v2.4s, v3.4s //[8 9 10 11]+[12 13 14 15]-->[9 13 11 15] + trn1 v0.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12] + trn2 v2.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[2 6 10 14] + trn1 v1.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13] + trn2 v3.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[3 7 11 15] + COL_TRANSFORM_1_STEP v0, v1, v2, v3, v16, v17, v18, v19 + + TRANSFORM_4BYTES v0, v1, v2, v3, v16, v17, v18, v19 + //after clip_table[MAX_NEG_CROP] into [0, 255] + mov x2, x0 + ld1 {v16.s}[0],[x0],x1 + ld1 {v16.s}[1],[x0],x1 + ld1 {v17.s}[0],[x0],x1 + ld1 {v17.s}[1],[x0] + + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + rshrn v1.4h, v2.4s, #6 + rshrn2 v1.8h, v3.4s, #6 + + uxtl v2.8h,v16.8b + uxtl v3.8h,v17.8b + add v2.8h, v2.8h, v0.8h + add v3.8h, v3.8h, v1.8h + + sqxtun v0.8b,v2.8h + sqxtun v1.8b,v3.8h + + st1 {v0.s}[0],[x2],x1 + st1 {v0.s}[1],[x2],x1 + st1 {v1.s}[0],[x2],x1 + st1 {v1.s}[1],[x2] +WELS_ASM_AARCH64_FUNC_END +#endif diff --git a/codec/decoder/core/inc/decode_mb_aux.h b/codec/decoder/core/inc/decode_mb_aux.h index 656d71fb..01ac64c2 100644 --- a/codec/decoder/core/inc/decode_mb_aux.h +++ b/codec/decoder/core/inc/decode_mb_aux.h @@ -52,6 +52,11 @@ void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs); void IdctResAddPred_neon (uint8_t* pred, const int32_t stride, int16_t* rs); #endif +#if defined(HAVE_NEON_AARCH64) +void IdctResAddPred_AArch64_neon (uint8_t* pred, const int32_t stride, int16_t* rs); +#endif + + #if defined(__cplusplus) } #endif//__cplusplus diff --git a/codec/decoder/core/inc/decode_slice.h b/codec/decoder/core/inc/decode_slice.h 
index 3c0e4b26..f3ac39ba 100644 --- a/codec/decoder/core/inc/decode_slice.h +++ b/codec/decoder/core/inc/decode_slice.h @@ -67,6 +67,9 @@ extern "C" { void SetNonZeroCount_neon (int8_t* pNonZeroCount); #endif +#if defined(HAVE_NEON_AARCH64) +void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount); +#endif #ifdef __cplusplus } #endif//__cplusplus diff --git a/codec/decoder/core/src/decode_slice.cpp b/codec/decoder/core/src/decode_slice.cpp index 8036d54e..ae970099 100644 --- a/codec/decoder/core/src/decode_slice.cpp +++ b/codec/decoder/core/src/decode_slice.cpp @@ -1063,6 +1063,12 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) { pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon; } #endif + +#ifdef HAVE_NEON_AARCH64 + if (iCpu & WELS_CPU_NEON) { + pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_AArch64_neon; + } +#endif } void SetNonZeroCount_c (int8_t* pNonZeroCount) { diff --git a/codec/decoder/core/src/decoder.cpp b/codec/decoder/core/src/decoder.cpp index d12ca6bd..56e8edd6 100644 --- a/codec/decoder/core/src/decoder.cpp +++ b/codec/decoder/core/src/decoder.cpp @@ -690,7 +690,7 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) { #if defined(HAVE_NEON_AARCH64) if (pCtx->uiCpuFlag & WELS_CPU_NEON) { - //pCtx->pIdctResAddPredFunc = IdctResAddPred_neon; + pCtx->pIdctResAddPredFunc = IdctResAddPred_AArch64_neon; pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_AArch64_neon; pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_AArch64_neon; diff --git a/codec/decoder/core/src/error_concealment.cpp b/codec/decoder/core/src/error_concealment.cpp index 7327982e..9a74f6e0 100644 --- a/codec/decoder/core/src/error_concealment.cpp +++ b/codec/decoder/core/src/error_concealment.cpp @@ -61,6 +61,13 @@ void InitErrorCon (PWelsDecoderContext pCtx) { pCtx->sCopyFunc.pCopyChromaFunc = WelsCopy8x8_neon; //aligned } #endif //HAVE_NEON + +#if defined(HAVE_NEON_AARCH64) + if (pCtx->uiCpuFlag & WELS_CPU_NEON) { 
+ pCtx->sCopyFunc.pCopyLumaFunc = WelsCopy16x16_AArch64_neon; //aligned + pCtx->sCopyFunc.pCopyChromaFunc = WelsCopy8x8_AArch64_neon; //aligned + } +#endif //HAVE_NEON_AARCH64 } //TODO add more methods here return; } diff --git a/codec/decoder/targets.mk b/codec/decoder/targets.mk index 03e58673..599c3487 100644 --- a/codec/decoder/targets.mk +++ b/codec/decoder/targets.mk @@ -41,6 +41,7 @@ endif ifeq ($(ASM_ARCH), arm64) DECODER_ASM_ARM64_SRCS=\ + $(DECODER_SRCDIR)/core/arm64/block_add_aarch64_neon.S\ $(DECODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\ DECODER_OBJS += $(DECODER_ASM_ARM64_SRCS:.S=.$(OBJ)) diff --git a/test/decoder/DecUT_IdctResAddPred.cpp b/test/decoder/DecUT_IdctResAddPred.cpp index 3806b689..6d78bc01 100644 --- a/test/decoder/DecUT_IdctResAddPred.cpp +++ b/test/decoder/DecUT_IdctResAddPred.cpp @@ -1,6 +1,7 @@ #include <gtest/gtest.h> #include "macros.h" #include "decode_mb_aux.h" +#include "../../codec/decoder/core/src/decode_slice.cpp" using namespace WelsDec; void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) { int16_t iSrc[16]; @@ -39,6 +40,14 @@ void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) { } } +void SetNonZeroCount_ref (int8_t* pNonZeroCount) { + int32_t i; + + for (i = 0; i < 24; i++) { + pNonZeroCount[i] = !!pNonZeroCount[i]; + } +} + #define GENERATE_IDCTRESADDPRED(pred) \ TEST(DecoderDecodeMbAux, pred) {\ const int32_t kiStride = 32;\ @@ -79,3 +88,47 @@ GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx) #if defined(HAVE_NEON) GENERATE_IDCTRESADDPRED (IdctResAddPred_neon) #endif + +#if defined(HAVE_NEON_AARCH64) +GENERATE_IDCTRESADDPRED (IdctResAddPred_AArch64_neon) +#endif + +#define GENERATE_SETNONZEROCOUNT(method) \ +TEST(DecoderDecodeMbAux, method) \ +{\ + int8_t iNonZeroCount[2][24];\ + for(int32_t i = 0; i < 24; i++) {\ + iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 256)-128;\ + }\ + method(iNonZeroCount[0]);\ + SetNonZeroCount_ref(iNonZeroCount[1]);\ + for(int32_t i =0; i<24; 
i++) {\ + ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\ + }\ + for(int32_t i =0; i<24; i++) {\ + iNonZeroCount[0][i] = iNonZeroCount[1][i] = -128;\ + }\ + method(iNonZeroCount[0]);\ + SetNonZeroCount_ref(iNonZeroCount[1]);\ + for(int32_t i =0; i<24; i++) {\ + ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\ + }\ + for(int32_t i =0; i<24; i++) {\ + iNonZeroCount[0][i] = iNonZeroCount[1][i] = 127;\ + }\ + method(iNonZeroCount[0]);\ + SetNonZeroCount_ref(iNonZeroCount[1]);\ + for(int32_t i =0; i<24; i++) {\ + ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\ + }\ +} + +GENERATE_SETNONZEROCOUNT (SetNonZeroCount_c) + +#if defined(HAVE_NEON) +GENERATE_SETNONZEROCOUNT (SetNonZeroCount_neon) +#endif + +#if defined(HAVE_NEON_AARCH64) +GENERATE_SETNONZEROCOUNT (SetNonZeroCount_AArch64_neon) +#endif