Merge pull request #1694 from zhilwang/asm-SetNoneZero
Add asm code for NonZeroCount and refine related code
This commit is contained in:
commit
ac08cc4b2f
@ -834,17 +834,12 @@ WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// WelsNonZeroCount_neon
// Every load and store below uses [r0] as its address and moves three D
// registers (d0, d1, d2 = 24 bytes), so the 24 bytes at r0 are rewritten in place.
// NOTE(review): this span reads like a rendered diff hunk showing two instruction
// sequences back to back (a compare-based one ending at the first vst1, then a
// vmin-based one) - confirm which lines belong to the current source.
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
|
||||
|
||||
vld1.64 {d0-d2}, [r0] // load 24 bytes from [r0] into d0, d1, d2
|
||||
|
||||
vceq.s8 q0, q0, #0 // q0 (d0:d1): byte lane -> 0xFF if it was 0, else 0x00
|
||||
vceq.s8 d2, d2, #0 // same test for the last 8 bytes
|
||||
vmvn q0, q0 // bitwise NOT: lane is now 0xFF where the input was non-zero
|
||||
vmvn d2, d2
|
||||
vabs.s8 q0, q0 // |0xFF| as signed byte = 1, |0x00| = 0 -> lanes become 0/1
|
||||
vabs.s8 d2, d2
|
||||
|
||||
vst1.64 {d0-d2}, [r0] // store the 24 result bytes back to [r0]
|
||||
mov r1, #1
|
||||
vdup.8 q2, r1 // q2 (d4:d5): every byte lane = 1
|
||||
vld1.64 {d0,d1,d2}, [r0] // load 24 bytes from [r0]
|
||||
vmin.s8 q0, q0, q2 // signed per-byte min(value, 1)
|
||||
vmin.s8 d2, d2, d4 // NOTE(review): signed min leaves negative bytes unchanged; L120-126 use umin and L304-307 use pminub (unsigned) - confirm inputs are never negative
|
||||
vst1.64 {d0,d1,d2}, [r0] // store the 24 result bytes back to [r0]
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
@ -553,16 +553,12 @@ bs_mv_check_jump1:
|
||||
#endif
|
||||
|
||||
// WelsNonZeroCount_AArch64_neon
// Loads three 8-byte vectors (24 bytes) from [x0] and stores three 8-byte
// vectors back to [x0]; x0 is the only address used.
// NOTE(review): this span reads like a rendered diff hunk that mixes a
// compare-based sequence (ins/uzp1/cmeq/mvn/abs) with a umin-based one -
// confirm which lines belong to the current source.
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
|
||||
mov w1, #1
|
||||
dup v3.8b, w1 // v3: each of the 8 byte lanes = 1
|
||||
ld1 {v0.8b, v1.8b, v2.8b}, [x0] // bytes 0-7 -> v0, 8-15 -> v1, 16-23 -> v2
|
||||
ins v0.d[1], v1.d[0] // upper half of v0 <- the 8 bytes held in v1
|
||||
uzp1 v0.2d, v0.2d, v1.2d // v0 <- { v0.d[0], v1.d[0] }
|
||||
cmeq v0.16b, v0.16b, #0 // byte lane -> 0xFF if it was 0, else 0x00
|
||||
cmeq v2.8b, v2.8b, #0
|
||||
mvn v0.16b, v0.16b // bitwise NOT: 0xFF where the input was non-zero
|
||||
mvn v2.8b, v2.8b
|
||||
abs v0.16b, v0.16b // |0xFF| as signed byte = 1, |0x00| = 0
|
||||
abs v2.8b, v2.8b
|
||||
ins v1.d[0], v0.d[1] // move the upper 8 bytes of v0 back into v1
|
||||
umin v0.8b, v0.8b, v3.8b // unsigned per-byte min(value, 1)
|
||||
umin v1.8b, v1.8b, v3.8b
|
||||
umin v2.8b, v2.8b, v3.8b
|
||||
st1 {v0.8b, v1.8b, v2.8b}, [x0] // write the 24 result bytes back to [x0]
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
@ -15,6 +15,8 @@ void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int
|
||||
int8_t* pTc);
|
||||
void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
|
||||
|
||||
void WelsNonZeroCount_c (int8_t* pNonZeroCount);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif//__cplusplus
|
||||
@ -32,6 +34,7 @@ void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
|
||||
void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
|
||||
void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
|
||||
int8_t* pTC);
|
||||
void WelsNonZeroCount_sse2 (int8_t* pNonZeroCount);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
@ -48,6 +51,7 @@ void DeblockChromaEq4V_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
|
||||
void DeblockChromaLt4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
|
||||
int8_t* pTC);
|
||||
void DeblockChromaEq4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
|
||||
void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
@ -61,6 +65,7 @@ void DeblockChromaEq4V_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t i
|
||||
void DeblockChromaLt4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
|
||||
int8_t* pTC);
|
||||
void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
|
||||
void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
|
||||
#endif
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
|
@ -180,6 +180,13 @@ void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int
|
||||
DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
|
||||
}
|
||||
|
||||
// Rewrites the 24 entries starting at pNonZeroCount in place as 0/1 flags:
// an entry that is 0 stays 0, any other value becomes 1.
void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
  int8_t* const pEnd = pNonZeroCount + 24;
  for (int8_t* pCur = pNonZeroCount; pCur != pEnd; ++pCur) {
    *pCur = (*pCur != 0) ? 1 : 0;
  }
}
|
||||
|
||||
#ifdef X86_ASM
|
||||
extern "C" {
|
||||
void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
|
||||
|
@ -5276,3 +5276,14 @@ WELS_EXTERN DeblockLumaTransposeV2H_sse2
|
||||
pop r3
|
||||
ret
|
||||
|
||||
; WelsNonZeroCount_sse2
; Reads 24 bytes at [r0] (16 via movdqu + 8 via movq), replaces every byte with
; the unsigned minimum of that byte and the matching byte of xmm2, and writes
; the 24 bytes back to the same addresses.
; NOTE(review): presumably LOAD_1_PARA makes the single argument available in
; r0 and WELS_DB1 fills xmm2 with bytes equal to 1 - neither macro is defined
; in this view; confirm against the macro definitions.
WELS_EXTERN WelsNonZeroCount_sse2
|
||||
%assign push_num 0
|
||||
LOAD_1_PARA
|
||||
movdqu xmm0, [r0] ; unaligned load of bytes 0-15
|
||||
movq xmm1, [r0+16] ; load bytes 16-23 into the low 8 bytes of xmm1
|
||||
WELS_DB1 xmm2
|
||||
pminub xmm0, xmm2 ; per-byte unsigned min(xmm0, xmm2)
|
||||
pminub xmm1, xmm2
|
||||
movdqu [r0], xmm0 ; store bytes 0-15
|
||||
movq [r0+16], xmm1 ; store bytes 16-23
|
||||
ret
|
||||
|
@ -99,18 +99,6 @@
|
||||
// }
|
||||
.endm
|
||||
#endif
|
||||
// SetNonZeroCount_neon
// r0: int8_t* non_zero_count - address of the 24 bytes processed below.
// Each of the 24 bytes at [r0] is replaced in place by 0 if it was 0 and by 1
// otherwise (compare-with-zero mask, inverted, then absolute value of 0xFF = 1).
|
||||
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
|
||||
vld1.64 {d0-d2}, [r0] // load 24 bytes into d0, d1, d2
|
||||
vceq.s8 q0, q0, #0 // q0 (d0:d1): byte lane -> 0xFF if it was 0, else 0x00
|
||||
vceq.s8 d2, d2, #0 // same test for the last 8 bytes
|
||||
vmvn q0, q0 // bitwise NOT: 0xFF where the input was non-zero
|
||||
vmvn d2, d2
|
||||
vabs.s8 q0, q0 // |0xFF| as signed byte = 1, |0x00| = 0
|
||||
vabs.s8 d2, d2
|
||||
vst1.64 {d0-d2}, [r0] // store the 24 result bytes back to [r0]
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// uint8_t *pred, const int32_t stride, int16_t *rs
|
||||
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
|
||||
|
@ -100,20 +100,6 @@
|
||||
// }
|
||||
.endm
|
||||
#endif
|
||||
// SetNonZeroCount_AArch64_neon
// x0: int8_t* non_zero_count - address of the 24 bytes processed below.
// Each of the 24 bytes is replaced in place by 0 if it was 0 and by 1 otherwise.
// x1 is used as the read cursor so x0 still points at the start for the stores;
// the first store post-increments x0 by 16, so x0 is modified by this routine.
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN SetNonZeroCount_AArch64_neon
|
||||
mov x1, x0 // read cursor = start address
|
||||
ld1 {v0.16b}, [x1], #16 // load bytes 0-15, then advance x1 by 16
|
||||
ld1 {v1.8b}, [x1] // load bytes 16-23
|
||||
cmeq v0.16b, v0.16b, #0 // byte lane -> 0xFF if it was 0, else 0x00
|
||||
cmeq v1.8b, v1.8b, #0
|
||||
mvn v0.16b, v0.16b // bitwise NOT: 0xFF where the input was non-zero
|
||||
mvn v1.8b, v1.8b
|
||||
abs v0.16b, v0.16b // |0xFF| as signed byte = 1, |0x00| = 0
|
||||
abs v1.8b, v1.8b
|
||||
st1 {v0.16b}, [x0], #16 // store bytes 0-15, then advance x0 by 16
|
||||
st1 {v1.8b}, [x0] // store bytes 16-23
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// uint8_t *pred, const int32_t stride, int16_t *rs
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
|
||||
|
@ -73,13 +73,11 @@ void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
void SetNonZeroCount_neon (int8_t* pNonZeroCount);
|
||||
void WelsBlockZero16x16_neon(int16_t * block, int32_t stride);
|
||||
void WelsBlockZero8x8_neon(int16_t * block, int32_t stride);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
|
||||
void WelsBlockZero16x16_AArch64_neon(int16_t * block, int32_t stride);
|
||||
void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride);
|
||||
#endif
|
||||
@ -87,8 +85,6 @@ void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride);
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
||||
void SetNonZeroCount_c (int8_t* pNonZeroCount);
|
||||
|
||||
void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu);
|
||||
void WelsBlockZero16x16_c(int16_t * block, int32_t stride);
|
||||
void WelsBlockZero8x8_c(int16_t * block, int32_t stride);
|
||||
|
@ -1695,25 +1695,13 @@ int32_t WelsDecodeMbCavlcPSlice (PWelsDecoderContext pCtx, PNalUnit pNalCur, uin
|
||||
}
|
||||
|
||||
void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_c;
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
if (iCpu & WELS_CPU_NEON) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
if (iCpu & WELS_CPU_NEON) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_AArch64_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_c;
|
||||
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_c;
|
||||
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_c;
|
||||
//TO DO add neon and X86
|
||||
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_c;
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
if (iCpu & WELS_CPU_NEON) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_neon;
|
||||
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_neon;
|
||||
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_neon;
|
||||
}
|
||||
@ -1721,6 +1709,7 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
if (iCpu & WELS_CPU_NEON) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_AArch64_neon;
|
||||
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_AArch64_neon;
|
||||
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_AArch64_neon;
|
||||
}
|
||||
@ -1728,6 +1717,7 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
|
||||
|
||||
#if defined(X86_ASM)
|
||||
if (iCpu & WELS_CPU_SSE2) {
|
||||
pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_sse2;
|
||||
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_sse2;
|
||||
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_sse2;
|
||||
}
|
||||
@ -1735,14 +1725,6 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
|
||||
|
||||
}
|
||||
|
||||
// Converts the 24 entries of pNonZeroCount into 0/1 flags in place
// (0 stays 0, every non-zero value becomes 1). Entries are independent,
// so they are visited from the last index down to the first.
void SetNonZeroCount_c (int8_t* pNonZeroCount) {
  for (int32_t iIdx = 23; iIdx >= 0; --iIdx) {
    pNonZeroCount[iIdx] = static_cast<int8_t> (pNonZeroCount[iIdx] != 0);
  }
}
|
||||
|
||||
void WelsBlockInit (int16_t* pBlock, int iW, int iH, int iStride, uint8_t uiVal) {
|
||||
int32_t i;
|
||||
int16_t* pDst = pBlock;
|
||||
|
@ -65,12 +65,10 @@ typedef struct TagDeblockingFilter {
|
||||
extern "C" {
|
||||
#endif//__cplusplus
|
||||
#if defined(HAVE_NEON)
|
||||
void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
|
||||
void DeblockingBSCalcEnc_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
|
||||
uint8_t (*pBS)[4][4]);
|
||||
#endif
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
|
||||
void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
|
||||
uint8_t (*pBS)[4][4]);
|
||||
#endif
|
||||
@ -79,7 +77,6 @@ void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBo
|
||||
#endif//__cplusplus
|
||||
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu);
|
||||
|
||||
void WelsNonZeroCount_c (int8_t* pNonZeroCount);
|
||||
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu);
|
||||
|
||||
void PerformDeblockingFilter (sWelsEncCtx* pEnc);
|
||||
|
@ -774,13 +774,6 @@ void PerformDeblockingFilter (sWelsEncCtx* pEnc) {
|
||||
}
|
||||
}
|
||||
|
||||
// Normalizes 24 consecutive entries starting at pNonZeroCount, in place:
// each entry ends up as 1 if it held any non-zero value and as 0 otherwise.
void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
  int32_t iRemaining = 24;
  while (iRemaining-- > 0) {
    const int8_t kiCount = *pNonZeroCount;
    *pNonZeroCount++ = kiCount ? 1 : 0;
  }
}
|
||||
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu) {
|
||||
*pfSetNZCZero = WelsNonZeroCount_c;
|
||||
#ifdef HAVE_NEON
|
||||
@ -793,6 +786,11 @@ void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu)
|
||||
*pfSetNZCZero = WelsNonZeroCount_AArch64_neon;
|
||||
}
|
||||
#endif
|
||||
#if defined(X86_ASM)
|
||||
if (iCpu & WELS_CPU_SSE2) {
|
||||
*pfSetNZCZero = WelsNonZeroCount_sse2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include "macros.h"
|
||||
#include "decode_mb_aux.h"
|
||||
#include "../../codec/decoder/core/src/decode_slice.cpp"
|
||||
#include "deblocking.h"
|
||||
using namespace WelsDec;
|
||||
void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
|
||||
int16_t iSrc[16];
|
||||
@ -98,7 +98,7 @@ TEST(DecoderDecodeMbAux, method) \
|
||||
{\
|
||||
int8_t iNonZeroCount[2][24];\
|
||||
for(int32_t i = 0; i < 24; i++) {\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 256)-128;\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 25);\
|
||||
}\
|
||||
method(iNonZeroCount[0]);\
|
||||
SetNonZeroCount_ref(iNonZeroCount[1]);\
|
||||
@ -106,7 +106,7 @@ TEST(DecoderDecodeMbAux, method) \
|
||||
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
|
||||
}\
|
||||
for(int32_t i =0; i<24; i++) {\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = -128;\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = 0;\
|
||||
}\
|
||||
method(iNonZeroCount[0]);\
|
||||
SetNonZeroCount_ref(iNonZeroCount[1]);\
|
||||
@ -114,7 +114,7 @@ TEST(DecoderDecodeMbAux, method) \
|
||||
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
|
||||
}\
|
||||
for(int32_t i =0; i<24; i++) {\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = 127;\
|
||||
iNonZeroCount[0][i] = iNonZeroCount[1][i] = 16;\
|
||||
}\
|
||||
method(iNonZeroCount[0]);\
|
||||
SetNonZeroCount_ref(iNonZeroCount[1]);\
|
||||
@ -123,12 +123,16 @@ TEST(DecoderDecodeMbAux, method) \
|
||||
}\
|
||||
}
|
||||
|
||||
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_c)
|
||||
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_c)
|
||||
|
||||
#if defined(X86_ASM)
|
||||
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_sse2)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_neon)
|
||||
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_neon)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
GENERATE_SETNONZEROCOUNT (SetNonZeroCount_AArch64_neon)
|
||||
GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_AArch64_neon)
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user