Add x86 32/64bit asm code for Scc_hash

This commit is contained in:
zhiliang wang 2014-08-14 18:41:52 +08:00
parent 9d2e1a9384
commit b35f5797de
4 changed files with 420 additions and 0 deletions

View File

@ -252,6 +252,10 @@ void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, cons
#ifdef X86_ASM
extern "C"
{
void InitializeHashforFeature_sse2 (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
void FillQpelLocationByFeatureValue_sse2 (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
uint16_t** pFeatureValuePointerList);
int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

View File

@ -107,6 +107,8 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
#if defined (X86_ASM)
if (uiCpuFlag & WELS_CPU_SSE2) {
//for feature search
pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_sse2;
pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_sse2;
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?

View File

@ -31,6 +31,16 @@
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16
ALIGN 16
; Word constants used by FillQpelLocationByFeatureValue_sse2. Each table is read with movq,
; i.e. as four 16-bit words.
; Step added (paddw) to the x vector after each group of 4 columns.
mv_x_inc_x4 dw 0x10, 0x10, 0x10, 0x10
; Step added (paddw) to the y vector after each row.
mv_y_inc_x4 dw 0x04, 0x04, 0x04, 0x04
; Initial x vector for the first 4 columns of a row.
; NOTE(review): the "mx_" prefix looks like a typo of "mv_"; renaming it requires updating every user.
mx_x_offset_x4 dw 0x00, 0x04, 0x08, 0x0C
SECTION .text
%ifdef X86_32
;**********************************************************************************************************************
@ -661,6 +671,159 @@ WIDTH_LOOP_X16_SSE4:
%undef tmp_width
ret
;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
;                                          uint16_t** pFeatureValuePointerList)
;
; x86-32 version; all four arguments are read from the stack.
;
; Walks the kiWidth x kiHeight feature map row by row. For the block at column x, row y with
; feature value f, it stores the dword ((4*y) << 16) | (4*x) (each half is a 16-bit word) at
; *pFeatureValuePointerList[f] and then advances pFeatureValuePointerList[f] by 4 bytes.
;
; Requirements visible from the code:
;   - kiWidth must be a multiple of 4: columns are consumed 4 at a time and the row loop only
;     ends when the column counter reaches exactly zero.
;   - kiWidth <= 0 or kiHeight <= 0 returns immediately without touching memory.
;
; Register roles:
;   esi          read cursor into pFeatureOfBlock
;   edi          pFeatureValuePointerList
;   ebp          kiWidth
;   ecx          columns left in the current row
;   [i_height]   rows left (stack local)
;   eax/ebx/edx  scratch
;   xmm2  x words of the current 4 columns      xmm3  y words of the current row
;   xmm5  x words of columns 0..3               xmm4  zero (word -> dword unpack)
;   xmm7  x step per 4 columns                  xmm6  y step per row
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    push    esi
    push    edi
    push    ebx
    push    ebp
    %define _ps 16 ; push size
    %define _ls 4 ; local size
    %define sum_ref esp+_ps+_ls+4
    %define pos_list esp+_ps+_ls+16
    %define width esp+_ps+_ls+8
    %define height esp+_ps+_ls+12
    %define i_height esp
    sub     esp, _ls
    mov     esi, [sum_ref]
    mov     edi, [pos_list]
    mov     ebp, [width]
    mov     ebx, [height]
    ; Both loops below test their counter only after the body has run, so an empty
    ; dimension has to be rejected up front.
    test    ebp, ebp
    jle     near HASH_FILL_DONE_SSE2
    test    ebx, ebx
    jle     near HASH_FILL_DONE_SSE2
    mov     [i_height], ebx
    movq    xmm7, [mv_x_inc_x4]         ; x_qpel inc
    movq    xmm6, [mv_y_inc_x4]         ; y_qpel inc
    movq    xmm5, [mx_x_offset_x4]      ; x_qpel vector
    pxor    xmm4, xmm4
    pxor    xmm3, xmm3                  ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
    movdqa  xmm2, xmm5                  ; x_qpel vector, restart at column 0
    mov     ecx, ebp
HASH_WIDTH_LOOP_SSE2:
    movq    xmm0, [esi]                 ; load 4 feature values (8 bytes)
    punpcklwd xmm0, xmm4                ; widen features to dwords
    movdqa  xmm1, xmm2
    punpcklwd xmm1, xmm3                ; dword = x (low word) | y (high word)
    ; Columns 0..2 of the group; the shifts bring the next column into lane 0.
    %rep 3
    movd    edx, xmm0                   ; edx = feature value
    lea     ebx, [edi+edx*4]            ; ebx = &pFeatureValuePointerList[feature]
    mov     eax, [ebx]                  ; eax = write position for this feature
    movd    [eax], xmm1                 ; store the packed location
    ; NOTE(review): this reads the 4 bytes after the entry just written, so the location
    ; buffer needs readable slack after its last entry -- confirm with the allocator.
    mov     edx, [eax+4]                ; explicitly load eax+4 due cache miss from vtune observation
    lea     eax, [eax+4]
    mov     [ebx], eax                  ; advance the write position by one entry
    psrldq  xmm1, 4
    psrldq  xmm0, 4
    %endrep
    ; Column 3 of the group (no shift needed afterwards).
    movd    edx, xmm0
    lea     ebx, [edi+edx*4]
    mov     eax, [ebx]
    movd    [eax], xmm1
    mov     edx, [eax+4]                ; explicitly load eax+4 due cache miss from vtune observation
    lea     eax, [eax+4]
    mov     [ebx], eax
    paddw   xmm2, xmm7                  ; x vector -> next 4 columns
    lea     esi, [esi+8]
    sub     ecx, 4
    jnz     near HASH_WIDTH_LOOP_SSE2
    paddw   xmm3, xmm6                  ; y vector -> next row
    dec     dword [i_height]
    jnz     near HASH_HEIGHT_LOOP_SSE2
HASH_FILL_DONE_SSE2:
    add     esp, _ls
    %undef _ps
    %undef _ls
    %undef sum_ref
    %undef pos_list
    %undef width
    %undef height
    %undef i_height
    pop     ebp
    pop     ebx
    pop     edi
    pop     esi
    ret
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
;                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;
; x86-32 version; all five arguments are read from the stack.
;
; With p starting at pBuf, for i in [0, kiListSize):
;     pLocationOfFeature[i] = pFeatureValuePointerList[i] = p
;     p += 4 * pTimesOfFeatureValue[i]        (bytes)
; Entries are processed 4 at a time; a group whose four counts are all zero is written with
; 16-byte stores. The remaining (kiListSize & 3) entries are processed one by one.
; kiListSize <= 0 returns without touching memory; kiListSize < 4 uses only the scalar tail.
;
; movdqa is used on pTimesOfFeatureValue (every group) and on both pointer lists (all-zero
; groups), so all three arrays must be 16-byte aligned when the 4-wide path is taken.
;
; Register roles:
;   esi  pTimesOfFeatureValue        ebx  running buffer pointer p
;   edi  pLocationOfFeature          ebp  pFeatureValuePointerList
;   ecx  byte offset of the current entry (4 bytes per count and per pointer)
;   edx  groups left, then tail entries left
;   eax  scratch                     xmm7 zero
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
    push    ebx
    push    esi
    push    edi
    push    ebp
    %define _ps 16 ; push size
    mov     edi, [esp+_ps+16]           ; pLocationOfFeature
    mov     ebp, [esp+_ps+20]           ; pFeatureValuePointerList
    mov     esi, [esp+_ps+4]            ; pTimesOfFeatureValue
    mov     ebx, [esp+_ps+8]            ; pBuf
    mov     edx, [esp+_ps+12]           ; kiListSize
    xor     ecx, ecx
    pxor    xmm7, xmm7
    test    edx, edx
    jle     near hash_assign_no_rem_sse2 ; nothing to do for an empty or negative list
    sar     edx, 2                       ; edx = number of full groups of 4 (sets ZF)
    ; The group loop tests its counter only after the body has run, so it must not be
    ; entered with a zero group count.
    jz      near hash_assign_tail_sse2
hash_assign_loop_x4_sse2:
    movdqa  xmm0, [esi+ecx]             ; four counts
    pslld   xmm0, 2                     ; counts -> byte sizes (4 bytes per occurrence)
    movdqa  xmm1, xmm0
    pcmpeqd xmm1, xmm7
    movmskps eax, xmm1
    cmp     eax, 0x0f                   ; all four sizes zero?
    je      near hash_assign_with_copy_sse2
    %assign x 0
    %rep 4
    lea     eax, [edi+ecx+x]
    mov     [eax], ebx                  ; pLocationOfFeature[i] = p
    lea     eax, [ebp+ecx+x]
    mov     [eax], ebx                  ; pFeatureValuePointerList[i] = p
    movd    eax, xmm0
    add     ebx, eax                    ; p += byte size of this entry
    psrldq  xmm0, 4
    %assign x x+4
    %endrep
    jmp     near assign_next_sse2
hash_assign_with_copy_sse2:
    ; All four entries are empty, so they all point at the same p.
    movd    xmm1, ebx
    pshufd  xmm2, xmm1, 0
    movdqa  [edi+ecx], xmm2
    movdqa  [ebp+ecx], xmm2
assign_next_sse2:
    add     ecx, 16
    dec     edx
    jnz     near hash_assign_loop_x4_sse2
hash_assign_tail_sse2:
    mov     edx, [esp+_ps+12]           ; kiListSize
    and     edx, 3
    jz      near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
    lea     eax, [edi+ecx]
    mov     [eax], ebx
    lea     eax, [ebp+ecx]
    mov     [eax], ebx
    mov     eax, [esi+ecx]
    sal     eax, 2
    add     ebx, eax
    add     ecx, 4
    dec     edx
    jnz     near hash_assign_loop_x4_rem_sse2
hash_assign_no_rem_sse2:
    %undef _ps
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx
    ret
%else
;**********************************************************************************************************************
@ -1222,6 +1385,146 @@ WIDTH_LOOP_X16_SSE4:
LOAD_6_PARA_POP
ret
;-----------------------------------------------------------------------------------------------------------------------------
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
;                                          uint16_t** pFeatureValuePointerList)
;
; x86-64 version. r0..r6 are the register aliases set up by asm_inc.asm; LOAD_4_PARA places
; the four arguments in r0..r3.
;
; Walks the kiWidth x kiHeight feature map row by row. For the block at column x, row y with
; feature value f, it stores the dword ((4*y) << 16) | (4*x) (each half is a 16-bit word) at
; *pFeatureValuePointerList[f] and then advances pFeatureValuePointerList[f] by 4 bytes.
;
; Requirements visible from the code:
;   - kiWidth must be a multiple of 4: columns are consumed 4 at a time and the row loop only
;     ends when the column counter reaches exactly zero.
;   - kiWidth <= 0 or kiHeight <= 0 returns immediately without touching memory.
;
; Register roles:
;   r0   read cursor into pFeatureOfBlock     r3   pFeatureValuePointerList (8-byte entries)
;   r1   kiWidth                              r12  rows left
;   r4   columns left in the current row      r2   feature value (scratch after setup)
;   r5   &pFeatureValuePointerList[feature]   r6   write position for that feature
;   r13  destination of the touch load (value unused)
;   xmm2  x words of the current 4 columns    xmm3  y words of the current row
;   xmm5  x words of columns 0..3             xmm4  zero (word -> dword unpack)
;   xmm7  x step per 4 columns                xmm6  y step per row
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    push    r12
    push    r13
    ; Both loops below test their counter only after the body has run, so an empty
    ; dimension has to be rejected up front (after the pushes, so the exit path is shared).
    test    r1, r1
    jle     near HASH_FILL_DONE_SSE2
    test    r2, r2
    jle     near HASH_FILL_DONE_SSE2
    mov     r12, r2                     ; r12 = rows left; r2 is reused as scratch below
    movq    xmm7, [mv_x_inc_x4]         ; x_qpel inc
    movq    xmm6, [mv_y_inc_x4]         ; y_qpel inc
    movq    xmm5, [mx_x_offset_x4]      ; x_qpel vector
    pxor    xmm4, xmm4
    pxor    xmm3, xmm3                  ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
    movdqa  xmm2, xmm5                  ; x_qpel vector, restart at column 0
    mov     r4, r1
HASH_WIDTH_LOOP_SSE2:
    movq    xmm0, [r0]                  ; load 4 feature values (8 bytes)
    punpcklwd xmm0, xmm4                ; widen features to dwords
    movdqa  xmm1, xmm2
    punpcklwd xmm1, xmm3                ; dword = x (low word) | y (high word)
    ; Columns 0..2 of the group; the shifts bring the next column into lane 0.
    %rep 3
    movd    r2d, xmm0                   ; r2 = feature value (upper half cleared by the 32-bit write)
    lea     r5, [r3+r2*8]               ; r5 = &pFeatureValuePointerList[feature]
    mov     r6, [r5]                    ; r6 = write position for this feature
    movd    [r6], xmm1                  ; store the packed location
    ; NOTE(review): qword load, so this reads the 8 bytes after the entry just written; the
    ; location buffer needs readable slack after its last entry -- confirm with the allocator.
    mov     r13, [r6+4]                 ; explicitly load r6+4 due cache miss from vtune observation
    lea     r6, [r6+4]
    mov     [r5], r6                    ; advance the write position by one entry
    psrldq  xmm1, 4
    psrldq  xmm0, 4
    %endrep
    ; Column 3 of the group (no shift needed afterwards).
    movd    r2d, xmm0
    lea     r5, [r3+r2*8]
    mov     r6, [r5]
    movd    [r6], xmm1
    mov     r13, [r6+4]                 ; explicitly load r6+4 due cache miss from vtune observation
    lea     r6, [r6+4]
    mov     [r5], r6
    paddw   xmm2, xmm7                  ; x vector -> next 4 columns
    lea     r0, [r0+8]
    sub     r4, 4
    jnz     near HASH_WIDTH_LOOP_SSE2
    paddw   xmm3, xmm6                  ; y vector -> next row
    dec     r12
    jnz     near HASH_HEIGHT_LOOP_SSE2
HASH_FILL_DONE_SSE2:
    pop     r13
    pop     r12
    POP_XMM
    ret
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
;                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;
; x86-64 version. r0..r6 are the register aliases set up by asm_inc.asm; LOAD_5_PARA places
; the five arguments in r0..r4.
;
; With p starting at pBuf, for i in [0, kiListSize):
;     pLocationOfFeature[i] = pFeatureValuePointerList[i] = p
;     p += 4 * pTimesOfFeatureValue[i]        (bytes)
; Entries are processed 4 at a time; a group whose four counts are all zero is written with
; 16-byte stores. The remaining (kiListSize & 3) entries are processed one by one.
; kiListSize <= 0 returns without touching memory; kiListSize < 4 uses only the scalar tail.
;
; movdqa is used on pTimesOfFeatureValue (every group) and on both pointer lists (all-zero
; groups), so all three arrays must be 16-byte aligned when the 4-wide path is taken.
;
; Register roles:
;   r0   pTimesOfFeatureValue        r1   running buffer pointer p
;   r3   pLocationOfFeature          r4   pFeatureValuePointerList
;   r5   byte offset into the counts (4 bytes per entry); pointer lists use r5*2 (8 bytes per entry)
;   r2   groups of 4 left            r12  kiListSize, then tail entries left
;   r6, r13 scratch                  xmm3 zero
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION r2, r2d
    push    r12
    push    r13
    mov     r12, r2                     ; keep kiListSize for the tail count
    xor     r5, r5
    xor     r6, r6
    pxor    xmm3, xmm3
    test    r2, r2
    jle     near hash_assign_no_rem_sse2 ; nothing to do for an empty or negative list
    sar     r2, 2                        ; r2 = number of full groups of 4 (sets ZF)
    ; The group loop tests its counter only after the body has run, so it must not be
    ; entered with a zero group count.
    jz      near hash_assign_tail_sse2
hash_assign_loop_x4_sse2:
    movdqa  xmm0, [r0+r5]               ; four counts
    pslld   xmm0, 2                     ; counts -> byte sizes (4 bytes per occurrence)
    movdqa  xmm1, xmm0
    pcmpeqd xmm1, xmm3
    movmskps r6, xmm1
    cmp     r6, 0x0f                    ; all four sizes zero?
    jz      near hash_assign_with_copy_sse2
    %assign x 0
    %rep 4
    lea     r13, [r3+r5*2+x]
    mov     [r13], r1                   ; pLocationOfFeature[i] = p
    lea     r13, [r4+r5*2+x]
    mov     [r13], r1                   ; pFeatureValuePointerList[i] = p
    movd    r6d, xmm0                   ; 32-bit write clears the upper half of r6
    add     r1, r6                      ; p += byte size of this entry
    psrldq  xmm0, 4
    %assign x x+8
    %endrep
    jmp     near assign_next_sse2
hash_assign_with_copy_sse2:
    ; All four entries are empty, so they all point at the same p: two pointers per
    ; 16-byte store, two stores per list.
    movq    xmm1, r1
    pshufd  xmm2, xmm1, 01000100b
    movdqa  [r3+r5*2], xmm2
    movdqa  [r4+r5*2], xmm2
    movdqa  [r3+r5*2+16], xmm2
    movdqa  [r4+r5*2+16], xmm2
assign_next_sse2:
    add     r5, 16
    dec     r2
    jnz     near hash_assign_loop_x4_sse2
hash_assign_tail_sse2:
    and     r12, 3
    jz      near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
    lea     r13, [r3+r5*2]
    mov     [r13], r1
    lea     r13, [r4+r5*2]
    mov     [r13], r1
    mov     r6d, [r0+r5]
    sal     r6, 2
    add     r1, r6
    add     r5, 4
    dec     r12
    jnz     near hash_assign_loop_x4_rem_sse2
hash_assign_no_rem_sse2:
    pop     r13
    pop     r12
    ret
%endif
;**********************************************************************************************************************************

View File

@ -6,6 +6,7 @@
#include "cpu_core.h"
#include "cpu.h"
#include "macros.h"
#include "ls_defines.h"
#include "svc_motion_estimate.h"
using namespace WelsEnc;
@ -77,6 +78,33 @@ void SumOf16x16BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, co
}
}
// Reference implementation: hands every feature value its slice of pBuf.
// Entry i of both pointer lists is set to the start of the slice, and the slice is
// 2 * pTimesOfFeatureValue[i] uint16_t elements long.
void InitializeHashforFeature_ref (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
                                   uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList) {
  uint16_t* pCursor = pBuf;
  int32_t iIdx = 0;
  while (iIdx < kiListSize) {
    pFeatureValuePointerList[iIdx] = pCursor;
    pLocationOfFeature[iIdx] = pCursor;
    // two uint16_t slots per occurrence of this feature value
    pCursor += (pTimesOfFeatureValue[iIdx] << 1);
    ++iIdx;
  }
}
// Reference implementation: for every block (x, y) of the feature map, writes the packed
// value ((4*y) << 16) | (4*x) through the pointer registered for that block's feature value
// and then advances that pointer by two uint16_t slots.
void FillQpelLocationByFeatureValue_ref (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
    uint16_t** pFeatureValuePointerList) {
  uint16_t* pRow = pFeatureOfBlock;
  int32_t iQpelRow = 0;
  int32_t iRow = 0;
  while (iRow < kiHeight) {
    int32_t iCol = 0;
    while (iCol < kiWidth) {
      const uint16_t uiKey = pRow[iCol];
      const int32_t iPacked = (iQpelRow << 16) | (iCol << 2);
      ST32 (&pFeatureValuePointerList[uiKey][0], iPacked);
      pFeatureValuePointerList[uiKey] += 2;
      ++iCol;
    }
    pRow += kiWidth;
    iQpelRow += 4;
    ++iRow;
  }
}
#define GENERATE_SumOfSingleBlock(anchor, method) \
TEST (SVC_ME_FunTest, method) {\
ENFORCE_STACK_ALIGN_1D (uint8_t, uiRefBuf, 16*320, 16);\
@ -136,6 +164,89 @@ delete[] pFeatureOfBlockBuff1; \
delete[] pFeatureOfBlockBuff2; \
}
// GENERATE_InitializeHashforFeature(anchor, method, kiWidth, kiHeight)
// Defines the gtest case SVC_ME_FunTest.<method>_<kiWidth>x<kiHeight>.
// Each of the SVC_ME_TEST_NUM rounds fills the reference picture with random data, clears the
// count and pointer arrays, and computes pFeatureOfBlock / pTimesOfFeatureValue with
// SumOf8x8BlockOfFrame_c. It then calls `anchor` and `method` with the same counts, the same
// buffer (pLocation1) and a list size of 65536, and expects the two resulting
// pLocationOfFeature lists and the two pFeatureValuePointerList lists to match entry by entry.
// All buffers come from ENFORCE_NEW_ALIGN_1D with a 16 alignment argument and are released
// with delete[] at the end of the test.
#define GENERATE_InitializeHashforFeature(anchor, method, kiWidth, kiHeight) \
TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, (kiWidth*kiHeight), 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, (kiWidth*kiHeight)*2, 16) \
ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, 65536, 16) \
for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { \
FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)); \
memset(pTimesOfFeatureValue, 0, 65536*sizeof(uint32_t)); \
memset(pLocationFeature0, 0, 65536*sizeof(uint16_t*)); \
memset(pFeaturePointValueList0, 0, 65536*sizeof(uint16_t*)); \
memset(pLocationFeature1, 0, 65536*sizeof(uint16_t*)); \
memset(pFeaturePointValueList1, 0, 65536*sizeof(uint16_t*)); \
SumOf8x8BlockOfFrame_c (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock,pTimesOfFeatureValue); \
int32_t iActSize = 65536;\
anchor ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0);\
method ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature1, pFeaturePointValueList1); \
for(int32_t j =0; j<65536; j++) { \
EXPECT_EQ (pLocationFeature0[j], pLocationFeature1[j]); \
EXPECT_EQ (pFeaturePointValueList0[j], pFeaturePointValueList1[j]); \
} \
} \
delete[] pRefPictureBuff; \
delete[] pFeatureOfBlockBuff; \
delete[] pLocationBuff1; \
delete[] pTimesOfFeatureValueBuff; \
delete[] pLocationFeature0Buff; \
delete[] pFeaturePointValueList0Buff; \
delete[] pLocationFeature1Buff; \
delete[] pFeaturePointValueList1Buff; \
}
// GENERATE_FillQpelLocationByFeatureValue(anchor, method, kiWidth, kiHeight)
// Defines the gtest case SVC_ME_FunTest.<method>_<kiWidth>x<kiHeight>.
// Each of the SVC_ME_TEST_NUM rounds fills the reference picture with random data, clears the
// count and pointer arrays, and computes pFeatureOfBlock / pTimesOfFeatureValue with
// SumOf8x8BlockOfFrame_c. InitializeHashforFeature_c then sets up two independent pointer
// lists, one over pLocation1 and one over pLocation2. `anchor` fills through the first list,
// `method` through the second, and the first kiWidth*kiHeight*2 uint16_t entries of
// pLocation1 and pLocation2 are expected to be equal.
// All buffers come from ENFORCE_NEW_ALIGN_1D with a 16 alignment argument and are released
// with delete[] at the end of the test.
#define GENERATE_FillQpelLocationByFeatureValue(anchor, method, kiWidth, kiHeight) \
TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, (kiWidth*kiHeight), 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, (kiWidth*kiHeight)*2, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation2, pLocationBuff2, (kiWidth*kiHeight)*2, 16) \
ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, 65536, 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, 65536, 16) \
for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { \
FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)); \
memset(pTimesOfFeatureValue, 0, 65536*sizeof(uint32_t)); \
memset(pLocationFeature0, 0, 65536*sizeof(uint16_t*)); \
memset(pFeaturePointValueList0, 0, 65536*sizeof(uint16_t*)); \
memset(pLocationFeature1, 0, 65536*sizeof(uint16_t*)); \
memset(pFeaturePointValueList1, 0, 65536*sizeof(uint16_t*)); \
SumOf8x8BlockOfFrame_c (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock,pTimesOfFeatureValue); \
int32_t iActSize = 65536; \
InitializeHashforFeature_c ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0); \
InitializeHashforFeature_c( pTimesOfFeatureValue, pLocation2, iActSize, pLocationFeature1, pFeaturePointValueList1); \
anchor(pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList0); \
method(pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList1); \
for(int32_t j =0; j<kiWidth*kiHeight*2; j++) { \
EXPECT_EQ (pLocation1[j], pLocation2[j]); \
} \
} \
delete[] pRefPictureBuff; \
delete[] pFeatureOfBlockBuff; \
delete[] pLocationBuff1; \
delete[] pLocationBuff2; \
delete[] pTimesOfFeatureValueBuff; \
delete[] pLocationFeature0Buff; \
delete[] pFeaturePointValueList0Buff; \
delete[] pLocationFeature1Buff; \
delete[] pFeaturePointValueList1Buff; \
}
// Instantiate the hash tests: the C implementations are always checked against the _ref
// versions; the SSE2 implementations are checked only when X86_ASM is defined.
// The FillQpelLocation instances use a 16x16 map, i.e. a width that is a multiple of 4.
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 10, 10)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 16, 16)
#ifdef X86_ASM
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 10, 10)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 16, 16)
#endif
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 320)