Merge pull request #1199 from dongzha/AddResArm64master
Add Unit Test and ARM64 code for block_add (for master)
This commit is contained in:
commit
123b904f77
@ -30,6 +30,7 @@
|
||||
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
|
||||
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
|
||||
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
|
||||
6C749B6A197CC6E600A111F9 /* block_add_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */; };
|
||||
9ABF4382193EB60900A6BD61 /* expand_pic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9ABF4381193EB60900A6BD61 /* expand_pic.cpp */; };
|
||||
9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66551946A1DE009A3567 /* welsCodecTrace.cpp */; };
|
||||
9AED66591946A203009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66581946A203009A3567 /* utils.cpp */; };
|
||||
@ -108,6 +109,7 @@
|
||||
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
|
||||
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
|
||||
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
|
||||
6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = block_add_aarch64_neon.S; path = arm64/block_add_aarch64_neon.S; sourceTree = "<group>"; };
|
||||
9ABF4380193EB5F700A6BD61 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = expand_pic.h; path = ../../../common/inc/expand_pic.h; sourceTree = "<group>"; };
|
||||
9ABF4381193EB60900A6BD61 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = expand_pic.cpp; path = ../../../common/src/expand_pic.cpp; sourceTree = "<group>"; };
|
||||
9AED66551946A1DE009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };
|
||||
@ -133,6 +135,7 @@
|
||||
4CBC1B7F194AC4A400214D9E /* arm64 */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */,
|
||||
4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */,
|
||||
);
|
||||
name = arm64;
|
||||
@ -353,6 +356,7 @@
|
||||
4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,
|
||||
4CE4469918BC5EAB0017DF25 /* mv_pred.cpp in Sources */,
|
||||
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */,
|
||||
6C749B6A197CC6E600A111F9 /* block_add_aarch64_neon.S in Sources */,
|
||||
4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */,
|
||||
9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */,
|
||||
F0B204FC18FD23D8005DA23F /* error_concealment.cpp in Sources */,
|
||||
|
161
codec/decoder/core/arm64/block_add_aarch64_neon.S
Normal file
161
codec/decoder/core/arm64/block_add_aarch64_neon.S
Normal file
@ -0,0 +1,161 @@
|
||||
/*!
|
||||
* \copy
|
||||
* Copyright (c) 2013, Cisco Systems
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
.text
|
||||
#include "arm_arch64_common_macro.S"
|
||||
#ifdef __APPLE__
|
||||
// ROW_TRANSFORM_1_STEP -- positional-argument variant, assembled when __APPLE__ is defined.
// First half of the row pass: widens 16-bit inputs to 32-bit sums/differences.
//   $0-$3 : inputs, read as four 16-bit lanes (.4h)
//   $4-$7 : outputs, written as four 32-bit lanes (.4s)
//   $8,$9 : scratch registers, overwritten with the halved copies of $1 and $3
.macro ROW_TRANSFORM_1_STEP
|
||||
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
|
||||
|
||||
saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
|
||||
ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
|
||||
sshr $8.4h, $1.4h, #1 // scratch $8 = src[1] >> 1 (arithmetic shift, still 16-bit lanes)
|
||||
sshr $9.4h, $3.4h, #1 // scratch $9 = src[3] >> 1 (arithmetic shift, still 16-bit lanes)
|
||||
ssubl $6.4s, $8.4h, $3.4h //int32 e[i][2] = (src[1]>>1)-src[3];
|
||||
saddl $7.4s, $1.4h, $9.4h //int32 e[i][3] = src[1] + (src[3]>>1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
// TRANSFORM_4BYTES -- positional-argument variant, assembled when __APPLE__ is defined.
// Second (butterfly) half of a pass; the same macro is invoked after both the
// row step and the column step.
//   $4-$7 : inputs e[0]..e[3], four 32-bit lanes each (.4s)
//   $0-$3 : outputs f[0]..f[3], four 32-bit lanes each (.4s)
.macro TRANSFORM_4BYTES // both row & col transform used
|
||||
// { // output: f_q[0]~[3], input: e_q[0]~[3];
|
||||
add $0.4s, $4.4s, $7.4s // f[i][0] = e[i][0] + e[i][3]; (computed in 32-bit lanes)
|
||||
add $1.4s, $5.4s, $6.4s // f[i][1] = e[i][1] + e[i][2]; (computed in 32-bit lanes)
|
||||
sub $2.4s, $5.4s, $6.4s // f[i][2] = e[i][1] - e[i][2]; (computed in 32-bit lanes)
|
||||
sub $3.4s, $4.4s, $7.4s // f[i][3] = e[i][0] - e[i][3]; (computed in 32-bit lanes)
|
||||
// }
|
||||
.endm
|
||||
|
||||
// COL_TRANSFORM_1_STEP -- positional-argument variant, assembled when __APPLE__ is defined.
// First half of the column pass; unlike the row step it works entirely on 32-bit lanes.
//   $0-$3 : inputs f[0]..f[3] (.4s)
//   $4-$7 : outputs e[0]..e[3] (.4s); $6 and $7 first hold the halved values and
//           are then updated in place, so no separate scratch registers are needed
.macro COL_TRANSFORM_1_STEP
|
||||
// { // input: src_q[0]~[3], output: e_q[0]~[3];
|
||||
add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
|
||||
sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
|
||||
sshr $6.4s, $1.4s, #1 // $6 = f[1][j] >> 1 (arithmetic shift)
|
||||
sshr $7.4s, $3.4s, #1 // $7 = f[3][j] >> 1 (arithmetic shift)
|
||||
sub $6.4s, $6.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
|
||||
add $7.4s, $1.4s, $7.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
#else
|
||||
// ROW_TRANSFORM_1_STEP -- named-argument variant, assembled when __APPLE__ is not defined.
// First half of the row pass: widens 16-bit inputs to 32-bit sums/differences.
//   arg0-arg3 : inputs, read as four 16-bit lanes (.4h)
//   arg4-arg7 : outputs, written as four 32-bit lanes (.4s)
//   arg8,arg9 : scratch registers, overwritten with the halved copies of arg1 and arg3
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
|
||||
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()
|
||||
|
||||
saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2];
|
||||
ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2];
|
||||
sshr \arg8\().4h, \arg1\().4h, #1 // scratch arg8 = src[1] >> 1 (arithmetic shift, still 16-bit lanes)
|
||||
sshr \arg9\().4h, \arg3\().4h, #1 // scratch arg9 = src[3] >> 1 (arithmetic shift, still 16-bit lanes)
|
||||
ssubl \arg6\().4s, \arg8\().4h, \arg3\().4h //int32 e[i][2] = (src[1]>>1)-src[3];
|
||||
saddl \arg7\().4s, \arg1\().4h, \arg9\().4h //int32 e[i][3] = src[1] + (src[3]>>1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
// TRANSFORM_4BYTES -- named-argument variant, assembled when __APPLE__ is not defined.
// Second (butterfly) half of a pass; invoked after both the row step and the column step.
//   arg4-arg7 : inputs e[0]..e[3], four 32-bit lanes each (.4s)
//   arg0-arg3 : outputs f[0]..f[3], four 32-bit lanes each (.4s)
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// both row & col transform used
|
||||
// { // output: f_q[0]~[3], input: e_q[0]~[3];
|
||||
add \arg0\().4s, \arg4\().4s, \arg7\().4s // f[i][0] = e[i][0] + e[i][3]; (computed in 32-bit lanes)
|
||||
add \arg1\().4s, \arg5\().4s, \arg6\().4s // f[i][1] = e[i][1] + e[i][2]; (computed in 32-bit lanes)
|
||||
sub \arg2\().4s, \arg5\().4s, \arg6\().4s // f[i][2] = e[i][1] - e[i][2]; (computed in 32-bit lanes)
|
||||
sub \arg3\().4s, \arg4\().4s, \arg7\().4s // f[i][3] = e[i][0] - e[i][3]; (computed in 32-bit lanes)
|
||||
// }
|
||||
.endm
|
||||
|
||||
// COL_TRANSFORM_1_STEP -- named-argument variant, assembled when __APPLE__ is not defined.
// First half of the column pass; works entirely on 32-bit lanes.
//   arg0-arg3 : inputs f[0]..f[3] (.4s)
//   arg4-arg7 : outputs e[0]..e[3] (.4s); arg6 and arg7 first hold the halved values
//               and are then updated in place, so no separate scratch registers are needed
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// { // input: src_q[0]~[3], output: e_q[0]~[3];
|
||||
add \arg4\().4s, \arg0\().4s, \arg2\().4s //int32 e[0][j] = f[0][j] + f[2][j];
|
||||
sub \arg5\().4s, \arg0\().4s, \arg2\().4s //int32 e[1][j] = f[0][j] - f[2][j];
|
||||
sshr \arg6\().4s, \arg1\().4s, #1 // arg6 = f[1][j] >> 1 (arithmetic shift)
|
||||
sshr \arg7\().4s, \arg3\().4s, #1 // arg7 = f[3][j] >> 1 (arithmetic shift)
|
||||
sub \arg6\().4s, \arg6\().4s, \arg3\().4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
|
||||
add \arg7\().4s, \arg1\().4s, \arg7\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
|
||||
// }
|
||||
.endm
|
||||
#endif
|
||||
// x0 int8_t* non_zero_count,
// Rewrites, in place, the 24 bytes starting at x0 (one 16-byte vector plus one
// 8-byte vector): every non-zero byte becomes 1 and every zero byte stays 0.
// x1 is used as the read cursor so that x0 is still at the start for the stores.
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN SetNonZeroCount_AArch64_neon
|
||||
mov x1, x0 // x1 = read cursor; x0 is kept for the write-back
|
||||
ld1 {v0.16b}, [x1], #16 // bytes 0..15, then advance the read cursor by 16
|
||||
ld1 {v1.8b}, [x1] // bytes 16..23
|
||||
cmeq v0.16b, v0.16b, #0 // 0xFF where the byte was zero, 0x00 elsewhere
|
||||
cmeq v1.8b, v1.8b, #0
|
||||
mvn v0.16b, v0.16b // invert: 0xFF where the byte was non-zero
|
||||
mvn v1.8b, v1.8b
|
||||
abs v0.16b, v0.16b // 0xFF is -1 as a signed byte, so abs turns it into 1
|
||||
abs v1.8b, v1.8b
|
||||
st1 {v0.16b}, [x0], #16 // write back bytes 0..15, then advance by 16
|
||||
st1 {v1.8b}, [x0] // write back bytes 16..23
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// uint8_t *pred (x0), const int32_t stride (w1), int16_t *rs (x2)
//
// Runs the row pass and the column pass of the 4x4 transform over the sixteen
// int16 values at rs, narrows the result with a rounding right shift by 6, adds
// it to the 4x4 block of bytes at pred (rows are `stride` bytes apart) and
// stores the sum back to pred, saturated to the unsigned 8-bit range.
// Uses v0-v5 and v16-v19; x2 is reused as the store cursor once rs has been read.
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
    // stride is a 32-bit argument: only w1 is defined on entry. Sign-extend it
    // before x1 is used as a 64-bit post-index increment below, otherwise stale
    // upper bits could corrupt the row addresses.
    sxtw    x1, w1

    // De-interleaving load: v0 = rs[0,4,8,12], v1 = rs[1,5,9,13],
    // v2 = rs[2,6,10,14], v3 = rs[3,7,11,15].
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]
    ROW_TRANSFORM_1_STEP    v0, v1, v2, v3, v16, v17, v18, v19, v4, v5
    TRANSFORM_4BYTES        v0, v1, v2, v3, v16, v17, v18, v19

    // Transpose the 4x4 matrix of 32-bit elements held in v0-v3.
    trn1    v16.4s, v0.4s, v1.4s    //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]
    trn2    v17.4s, v0.4s, v1.4s    //[0 1 2 3]+[4 5 6 7]-->[1 5 3 7]
    trn1    v18.4s, v2.4s, v3.4s    //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]
    trn2    v19.4s, v2.4s, v3.4s    //[8 9 10 11]+[12 13 14 15]-->[9 13 11 15]
    trn1    v0.2d, v16.2d, v18.2d   //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]
    trn2    v2.2d, v16.2d, v18.2d   //[0 4 2 6]+[8 12 10 14]-->[2 6 10 14]
    trn1    v1.2d, v17.2d, v19.2d   //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]
    trn2    v3.2d, v17.2d, v19.2d   //[1 5 3 7]+[9 13 11 15]-->[3 7 11 15]

    COL_TRANSFORM_1_STEP    v0, v1, v2, v3, v16, v17, v18, v19
    TRANSFORM_4BYTES        v0, v1, v2, v3, v16, v17, v18, v19

    // Load the four prediction rows (4 bytes each). x2 keeps the original pred
    // pointer for the store pass because x0 is advanced by the loads.
    mov     x2, x0
    ld1     {v16.s}[0], [x0], x1
    ld1     {v16.s}[1], [x0], x1
    ld1     {v17.s}[0], [x0], x1
    ld1     {v17.s}[1], [x0]

    // Rounding shift right by 6 and narrow 32 -> 16 bits:
    // v0 = rows 0 and 1, v1 = rows 2 and 3.
    rshrn   v0.4h, v0.4s, #6
    rshrn2  v0.8h, v1.4s, #6
    rshrn   v1.4h, v2.4s, #6
    rshrn2  v1.8h, v3.4s, #6

    // Zero-extend the prediction bytes to 16 bits and add the residual.
    uxtl    v2.8h, v16.8b
    uxtl    v3.8h, v17.8b
    add     v2.8h, v2.8h, v0.8h
    add     v3.8h, v3.8h, v1.8h

    // Saturating narrow back to unsigned bytes, i.e. clip to [0, 255].
    sqxtun  v0.8b, v2.8h
    sqxtun  v1.8b, v3.8h

    // Store the four reconstructed rows back to pred.
    st1     {v0.s}[0], [x2], x1
    st1     {v0.s}[1], [x2], x1
    st1     {v1.s}[0], [x2], x1
    st1     {v1.s}[1], [x2]
WELS_ASM_AARCH64_FUNC_END
|
||||
#endif
|
@ -52,6 +52,11 @@ void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
|
||||
void IdctResAddPred_neon (uint8_t* pred, const int32_t stride, int16_t* rs);
|
||||
#endif
|
||||
|
||||
// AArch64 NEON variant of IdctResAddPred; declared only when HAVE_NEON_AARCH64
// is defined. Takes the same parameter list as IdctResAddPred_neon.
#if defined(HAVE_NEON_AARCH64)
|
||||
void IdctResAddPred_AArch64_neon (uint8_t* pred, const int32_t stride, int16_t* rs);
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
@ -67,6 +67,9 @@ extern "C" {
|
||||
void SetNonZeroCount_neon (int8_t* pNonZeroCount);
|
||||
#endif
|
||||
|
||||
// AArch64 NEON variant of SetNonZeroCount; declared only when HAVE_NEON_AARCH64
// is defined. Takes the same parameter list as SetNonZeroCount_neon.
#if defined(HAVE_NEON_AARCH64)
|
||||
void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
@ -1063,6 +1063,12 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
if (iCpu & WELS_CPU_NEON) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_AArch64_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void SetNonZeroCount_c (int8_t* pNonZeroCount) {
|
||||
|
@ -690,7 +690,7 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
if (pCtx->uiCpuFlag & WELS_CPU_NEON) {
|
||||
//pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
|
||||
pCtx->pIdctResAddPredFunc = IdctResAddPred_AArch64_neon;
|
||||
|
||||
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_AArch64_neon;
|
||||
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_AArch64_neon;
|
||||
|
@ -61,6 +61,13 @@ void InitErrorCon (PWelsDecoderContext pCtx) {
|
||||
pCtx->sCopyFunc.pCopyChromaFunc = WelsCopy8x8_neon; //aligned
|
||||
}
|
||||
#endif //HAVE_NEON
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
if (pCtx->uiCpuFlag & WELS_CPU_NEON) {
|
||||
pCtx->sCopyFunc.pCopyLumaFunc = WelsCopy16x16_AArch64_neon; //aligned
|
||||
pCtx->sCopyFunc.pCopyChromaFunc = WelsCopy8x8_AArch64_neon; //aligned
|
||||
}
|
||||
#endif //HAVE_NEON_AARCH64
|
||||
} //TODO add more methods here
|
||||
return;
|
||||
}
|
||||
|
@ -41,6 +41,7 @@ endif
|
||||
|
||||
ifeq ($(ASM_ARCH), arm64)
|
||||
DECODER_ASM_ARM64_SRCS=\
|
||||
$(DECODER_SRCDIR)/core/arm64/block_add_aarch64_neon.S\
|
||||
$(DECODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
|
||||
|
||||
DECODER_OBJS += $(DECODER_ASM_ARM64_SRCS:.S=.$(OBJ))
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include "macros.h"
|
||||
#include "decode_mb_aux.h"
|
||||
#include "../../codec/decoder/core/src/decode_slice.cpp"
|
||||
using namespace WelsDec;
|
||||
void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
|
||||
int16_t iSrc[16];
|
||||
@ -39,6 +40,14 @@ void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
|
||||
}
|
||||
}
|
||||
|
||||
// Reference implementation used by the SetNonZeroCount tests: normalises each
// of the 24 entries of pNonZeroCount in place, so that any non-zero value
// becomes 1 and zero stays 0.
void SetNonZeroCount_ref (int8_t* pNonZeroCount) {
  int8_t* pEntry = pNonZeroCount;
  int8_t* const pEnd = pNonZeroCount + 24;

  while (pEntry != pEnd) {
    *pEntry = (*pEntry != 0) ? 1 : 0;
    ++pEntry;
  }
}
|
||||
|
||||
#define GENERATE_IDCTRESADDPRED(pred) \
|
||||
TEST(DecoderDecodeMbAux, pred) {\
|
||||
const int32_t kiStride = 32;\
|
||||
@ -79,3 +88,47 @@ GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx)
|
||||
#if defined(HAVE_NEON)
|
||||
GENERATE_IDCTRESADDPRED (IdctResAddPred_neon)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
GENERATE_IDCTRESADDPRED (IdctResAddPred_AArch64_neon)
|
||||
#endif
|
||||
|
||||
/* GENERATE_SETNONZEROCOUNT(method)
 * Expands to a gtest case named `method` in the DecoderDecodeMbAux suite.
 * The case fills two identical 24-entry int8_t buffers, runs `method` on the
 * first and SetNonZeroCount_ref on the second, and asserts that all 24 entries
 * match. This is done three times: with rand()-derived values in [-128, 127],
 * with every entry set to -128, and with every entry set to 127.
 */
#define GENERATE_SETNONZEROCOUNT(method) \
|
||||
TEST(DecoderDecodeMbAux, method) \
|
||||
{\
|
||||
int8_t iNonZeroCount[2][24];\
|
||||
for(int32_t i = 0; i < 24; i++) {\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 256)-128;\
|
||||
}\
|
||||
method(iNonZeroCount[0]);\
|
||||
SetNonZeroCount_ref(iNonZeroCount[1]);\
|
||||
for(int32_t i =0; i<24; i++) {\
|
||||
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
|
||||
}\
|
||||
for(int32_t i =0; i<24; i++) {\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = -128;\
|
||||
}\
|
||||
method(iNonZeroCount[0]);\
|
||||
SetNonZeroCount_ref(iNonZeroCount[1]);\
|
||||
for(int32_t i =0; i<24; i++) {\
|
||||
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
|
||||
}\
|
||||
for(int32_t i =0; i<24; i++) {\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = 127;\
|
||||
}\
|
||||
method(iNonZeroCount[0]);\
|
||||
SetNonZeroCount_ref(iNonZeroCount[1]);\
|
||||
for(int32_t i =0; i<24; i++) {\
|
||||
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
|
||||
}\
|
||||
}
|
||||
|
||||
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_c)
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_neon)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_AArch64_neon)
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user