Add asm code for NonZeroCount and refine related code

This commit is contained in:
zhiliang wang 2015-01-04 16:39:17 +08:00
parent 7d5e88ffda
commit 01b74ea7c1
12 changed files with 56 additions and 91 deletions

View File

@ -834,17 +834,12 @@ WELS_ASM_FUNC_END
// void WelsNonZeroCount_neon (int8_t* pNonZeroCount)
//   r0 = pNonZeroCount: 24 bytes, rewritten in place.
// Every byte becomes 0 if it was zero and 1 otherwise (same result as the
// C fallback's `!!pNonZeroCount[i]`).
// A single unsigned min against 1 does the whole job: min_u(x, 1) is 0 for
// x == 0 and 1 for any other byte value, including bytes with the top bit
// set (negative when viewed as int8_t). A signed min would leave those
// bytes negative, so the unsigned form is required.
// Scratch registers: q0, d2, q2 (d4 is the low half of q2).
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
    vmov.i8     q2, #1                  // every byte lane of q2 (and d4) = 1
    vld1.64     {d0, d1, d2}, [r0]      // q0 = bytes 0..15, d2 = bytes 16..23
    vmin.u8     q0, q0, q2              // clamp bytes 0..15 to {0, 1}
    vmin.u8     d2, d2, d4              // clamp bytes 16..23 to {0, 1}
    vst1.64     {d0, d1, d2}, [r0]      // write all 24 bytes back
WELS_ASM_FUNC_END
#ifdef __APPLE__

View File

@ -553,16 +553,12 @@ bs_mv_check_jump1:
#endif
// void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount)
//   x0 = pNonZeroCount: 24 bytes, rewritten in place.
// Every byte becomes 0 if it was zero and 1 otherwise (same result as the
// C fallback's `!!pNonZeroCount[i]`).
// umin is an unsigned byte minimum, so min(x, 1) maps 0 -> 0 and every
// other byte value (0x01..0xFF) -> 1; no compare/invert/abs chain and no
// lane shuffling between v0 and v1 is needed.
// Scratch registers: v0, v1, v2, v3 (only their low 64 bits are used).
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
    movi    v3.8b, #1                       // eight byte lanes of 1
    ld1     {v0.8b, v1.8b, v2.8b}, [x0]     // 3 x 8 = 24 input bytes
    umin    v0.8b, v0.8b, v3.8b             // bytes 0..7   -> {0, 1}
    umin    v1.8b, v1.8b, v3.8b             // bytes 8..15  -> {0, 1}
    umin    v2.8b, v2.8b, v3.8b             // bytes 16..23 -> {0, 1}
    st1     {v0.8b, v1.8b, v2.8b}, [x0]     // write all 24 bytes back
WELS_ASM_AARCH64_FUNC_END

View File

@ -15,6 +15,8 @@ void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int
int8_t* pTc);
void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void WelsNonZeroCount_c (int8_t* pNonZeroCount);
#if defined(__cplusplus)
extern "C" {
#endif//__cplusplus
@ -32,6 +34,7 @@ void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
void WelsNonZeroCount_sse2 (int8_t* pNonZeroCount);
#endif
#if defined(HAVE_NEON)
@ -48,6 +51,7 @@ void DeblockChromaEq4V_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
void DeblockChromaLt4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
void DeblockChromaEq4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
#endif
#if defined(HAVE_NEON_AARCH64)
@ -61,6 +65,7 @@ void DeblockChromaEq4V_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t i
void DeblockChromaLt4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
#endif
#if defined(__cplusplus)
}

View File

@ -180,6 +180,13 @@ void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int
DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
}
// Normalizes the first 24 entries of pNonZeroCount in place:
// an entry that is zero stays 0, any other value becomes 1.
void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
  int8_t* const pEnd = pNonZeroCount + 24;
  for (int8_t* pCur = pNonZeroCount; pCur != pEnd; ++pCur) {
    *pCur = (*pCur != 0) ? 1 : 0;
  }
}
#ifdef X86_ASM
extern "C" {
void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {

View File

@ -5276,3 +5276,14 @@ WELS_EXTERN DeblockLumaTransposeV2H_sse2
pop r3
ret
; void WelsNonZeroCount_sse2 (int8_t* pNonZeroCount)
; Clamps 24 bytes at [r0] in place with an unsigned byte minimum against xmm2.
; NOTE(review): WELS_EXTERN, LOAD_1_PARA and WELS_DB1 are macros defined
; outside this view. Presumably LOAD_1_PARA makes the first argument
; available in r0 and WELS_DB1 fills its operand with bytes of value 1,
; which would make the result 0 for zero bytes and 1 otherwise - confirm
; against the macro definitions.
WELS_EXTERN WelsNonZeroCount_sse2
; No registers are pushed by this function.
%assign push_num 0
LOAD_1_PARA
; Unaligned 16-byte load of bytes 0..15.
movdqu xmm0, [r0]
; 8-byte load of bytes 16..23.
movq xmm1, [r0+16]
; Build the clamp constant in xmm2 (see NOTE above).
WELS_DB1 xmm2
; pminub is an unsigned per-byte minimum.
pminub xmm0, xmm2
pminub xmm1, xmm2
; Store the 16 + 8 result bytes back over the input.
movdqu [r0], xmm0
movq [r0+16], xmm1
ret

View File

@ -99,18 +99,6 @@
// }
.endm
#endif
// void SetNonZeroCount_neon (int8_t* non_zero_count)
//   r0 = non_zero_count: 24 bytes, rewritten in place.
// Each byte becomes 0 if it was zero and 1 otherwise. q0 holds bytes 0..15
// and d2 holds bytes 16..23, so every step is issued once per register.
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
// Load 24 bytes into d0, d1 (= q0) and d2.
vld1.64 {d0-d2}, [r0]
// Compare with zero: lanes become 0xFF where the byte was 0, else 0x00.
vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0
// Invert the mask: 0x00 where the byte was 0, 0xFF (-1) otherwise.
vmvn q0, q0
vmvn d2, d2
// |-1| = 1, so the mask collapses to the values 0 / 1.
vabs.s8 q0, q0
vabs.s8 d2, d2
// Store the 24 result bytes back over the input.
vst1.64 {d0-d2}, [r0]
WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon

View File

@ -100,20 +100,6 @@
// }
.endm
#endif
// void SetNonZeroCount_AArch64_neon (int8_t* non_zero_count)
//   x0 = non_zero_count: 24 bytes, rewritten in place.
// Each byte becomes 0 if it was zero and 1 otherwise. v0 holds bytes 0..15
// and the low half of v1 holds bytes 16..23.
WELS_ASM_AARCH64_FUNC_BEGIN SetNonZeroCount_AArch64_neon
// x1 is the read cursor; x0 is kept untouched for the stores below.
mov x1, x0
// Load 16 bytes and post-increment the cursor, then load the last 8 bytes.
ld1 {v0.16b}, [x1], #16
ld1 {v1.8b}, [x1]
// Compare with zero: lanes become 0xFF where the byte was 0, else 0x00.
cmeq v0.16b, v0.16b, #0
cmeq v1.8b, v1.8b, #0
// Invert the mask: 0x00 where the byte was 0, 0xFF (-1) otherwise.
mvn v0.16b, v0.16b
mvn v1.8b, v1.8b
// |-1| = 1, so the mask collapses to the values 0 / 1.
abs v0.16b, v0.16b
abs v1.8b, v1.8b
// Store 16 bytes (post-incrementing x0), then the remaining 8 bytes.
st1 {v0.16b}, [x0], #16
st1 {v1.8b}, [x0]
WELS_ASM_AARCH64_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon

View File

@ -73,13 +73,11 @@ void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
#endif
#if defined(HAVE_NEON)
void SetNonZeroCount_neon (int8_t* pNonZeroCount);
void WelsBlockZero16x16_neon(int16_t * block, int32_t stride);
void WelsBlockZero8x8_neon(int16_t * block, int32_t stride);
#endif
#if defined(HAVE_NEON_AARCH64)
void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
void WelsBlockZero16x16_AArch64_neon(int16_t * block, int32_t stride);
void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride);
#endif
@ -87,8 +85,6 @@ void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride);
}
#endif//__cplusplus
void SetNonZeroCount_c (int8_t* pNonZeroCount);
void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu);
void WelsBlockZero16x16_c(int16_t * block, int32_t stride);
void WelsBlockZero8x8_c(int16_t * block, int32_t stride);

View File

@ -1690,25 +1690,13 @@ int32_t WelsDecodeMbCavlcPSlice (PWelsDecoderContext pCtx, PNalUnit pNalCur, uin
}
void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_c;
#ifdef HAVE_NEON
if (iCpu & WELS_CPU_NEON) {
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
}
#endif
#ifdef HAVE_NEON_AARCH64
if (iCpu & WELS_CPU_NEON) {
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_AArch64_neon;
}
#endif
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_c;
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_c;
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_c;
//TO DO add neon and X86
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_c;
#ifdef HAVE_NEON
if (iCpu & WELS_CPU_NEON) {
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_neon;
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_neon;
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_neon;
}
@ -1716,6 +1704,7 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
#ifdef HAVE_NEON_AARCH64
if (iCpu & WELS_CPU_NEON) {
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_AArch64_neon;
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_AArch64_neon;
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_AArch64_neon;
}
@ -1723,6 +1712,7 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
#if defined(X86_ASM)
if (iCpu & WELS_CPU_SSE2) {
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_sse2;
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_sse2;
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_sse2;
}
@ -1730,14 +1720,6 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
}
// Rewrites the first 24 entries of pNonZeroCount in place so that each one
// is 1 when it was non-zero and 0 when it was zero.
void SetNonZeroCount_c (int8_t* pNonZeroCount) {
  for (int32_t iIdx = 23; iIdx >= 0; --iIdx) {
    // Zero entries already hold the wanted value; only non-zero ones change.
    if (pNonZeroCount[iIdx] != 0) {
      pNonZeroCount[iIdx] = 1;
    }
  }
}
void WelsBlockInit (int16_t* pBlock, int iW, int iH, int iStride, uint8_t uiVal) {
int32_t i;
int16_t* pDst = pBlock;

View File

@ -65,12 +65,10 @@ typedef struct TagDeblockingFilter {
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
void DeblockingBSCalcEnc_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
uint8_t (*pBS)[4][4]);
#endif
#if defined(HAVE_NEON_AARCH64)
void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
uint8_t (*pBS)[4][4]);
#endif
@ -79,7 +77,6 @@ void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBo
#endif//__cplusplus
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu);
void WelsNonZeroCount_c (int8_t* pNonZeroCount);
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu);
void PerformDeblockingFilter (sWelsEncCtx* pEnc);

View File

@ -774,13 +774,6 @@ void PerformDeblockingFilter (sWelsEncCtx* pEnc) {
}
}
// In-place normalization of 24 entries: every entry of pNonZeroCount is
// replaced by 1 if it was non-zero, or by 0 if it was zero.
void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
  const int32_t kiEntries = 24;
  int32_t iIdx = 0;
  while (iIdx < kiEntries) {
    pNonZeroCount[iIdx] = static_cast<int8_t> (pNonZeroCount[iIdx] != 0);
    ++iIdx;
  }
}
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu) {
*pfSetNZCZero = WelsNonZeroCount_c;
#ifdef HAVE_NEON
@ -793,6 +786,11 @@ void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu)
*pfSetNZCZero = WelsNonZeroCount_AArch64_neon;
}
#endif
#if defined(X86_ASM)
if (iCpu & WELS_CPU_SSE2) {
*pfSetNZCZero = WelsNonZeroCount_sse2;
}
#endif
}
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {

View File

@ -1,7 +1,7 @@
#include <gtest/gtest.h>
#include "macros.h"
#include "decode_mb_aux.h"
#include "../../codec/decoder/core/src/decode_slice.cpp"
#include "deblocking.h"
using namespace WelsDec;
void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
int16_t iSrc[16];
@ -98,7 +98,7 @@ TEST(DecoderDecodeMbAux, method) \
{\
int8_t iNonZeroCount[2][24];\
for(int32_t i = 0; i < 24; i++) {\
iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 256)-128;\
iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 25);\
}\
method(iNonZeroCount[0]);\
SetNonZeroCount_ref(iNonZeroCount[1]);\
@ -106,7 +106,7 @@ TEST(DecoderDecodeMbAux, method) \
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
}\
for(int32_t i =0; i<24; i++) {\
iNonZeroCount[0][i] = iNonZeroCount[1][i] = -128;\
iNonZeroCount[0][i] = iNonZeroCount[1][i] = 0;\
}\
method(iNonZeroCount[0]);\
SetNonZeroCount_ref(iNonZeroCount[1]);\
@ -114,7 +114,7 @@ TEST(DecoderDecodeMbAux, method) \
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
}\
for(int32_t i =0; i<24; i++) {\
iNonZeroCount[0][i] = iNonZeroCount[1][i] = 127;\
iNonZeroCount[0][i] = iNonZeroCount[1][i] = 16;\
}\
method(iNonZeroCount[0]);\
SetNonZeroCount_ref(iNonZeroCount[1]);\
@ -123,12 +123,16 @@ TEST(DecoderDecodeMbAux, method) \
}\
}
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_c)
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_c)
#if defined(X86_ASM)
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_sse2)
#endif
#if defined(HAVE_NEON)
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_neon)
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_neon)
#endif
#if defined(HAVE_NEON_AARCH64)
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_AArch64_neon)
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_AArch64_neon)
#endif