refine format and add UT cases
This commit is contained in:
parent
76863f977a
commit
ef88889404
@ -676,154 +676,154 @@ WIDTH_LOOP_X16_SSE4:
|
||||
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
|
||||
;-----------------------------------------------------------------------------------------------------------------------------
|
||||
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
push ebp
|
||||
|
||||
%define _ps 16 ; push size
|
||||
%define _ls 4 ; local size
|
||||
%define sum_ref esp+_ps+_ls+4
|
||||
%define pos_list esp+_ps+_ls+16
|
||||
%define width esp+_ps+_ls+8
|
||||
%define height esp+_ps+_ls+12
|
||||
%define i_height esp
|
||||
sub esp, _ls
|
||||
%define _ps 16 ; push size
|
||||
%define _ls 4 ; local size
|
||||
%define sum_ref esp+_ps+_ls+4
|
||||
%define pos_list esp+_ps+_ls+16
|
||||
%define width esp+_ps+_ls+8
|
||||
%define height esp+_ps+_ls+12
|
||||
%define i_height esp
|
||||
sub esp, _ls
|
||||
|
||||
mov esi, [sum_ref]
|
||||
mov edi, [pos_list]
|
||||
mov ebp, [width]
|
||||
mov ebx, [height]
|
||||
mov [i_height], ebx
|
||||
mov esi, [sum_ref]
|
||||
mov edi, [pos_list]
|
||||
mov ebp, [width]
|
||||
mov ebx, [height]
|
||||
mov [i_height], ebx
|
||||
|
||||
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
|
||||
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
|
||||
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
|
||||
pxor xmm4, xmm4
|
||||
pxor xmm3, xmm3 ; y_qpel vector
|
||||
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
|
||||
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
|
||||
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
|
||||
pxor xmm4, xmm4
|
||||
pxor xmm3, xmm3 ; y_qpel vector
|
||||
HASH_HEIGHT_LOOP_SSE2:
|
||||
movdqa xmm2, xmm5 ; x_qpel vector
|
||||
mov ecx, ebp
|
||||
movdqa xmm2, xmm5 ; x_qpel vector
|
||||
mov ecx, ebp
|
||||
HASH_WIDTH_LOOP_SSE2:
|
||||
movq xmm0, [esi] ; load x8 sum
|
||||
punpcklwd xmm0, xmm4
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
movq xmm0, [esi] ; load x8 sum
|
||||
punpcklwd xmm0, xmm4
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
%rep 3
|
||||
movd edx, xmm0
|
||||
lea ebx, [edi+edx*4]
|
||||
mov eax, [ebx]
|
||||
movd [eax], xmm1
|
||||
mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea eax, [eax+4]
|
||||
mov [ebx], eax
|
||||
psrldq xmm1, 4
|
||||
psrldq xmm0, 4
|
||||
movd edx, xmm0
|
||||
lea ebx, [edi+edx*4]
|
||||
mov eax, [ebx]
|
||||
movd [eax], xmm1
|
||||
mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea eax, [eax+4]
|
||||
mov [ebx], eax
|
||||
psrldq xmm1, 4
|
||||
psrldq xmm0, 4
|
||||
%endrep
|
||||
movd edx, xmm0
|
||||
lea ebx, [edi+edx*4]
|
||||
mov eax, [ebx]
|
||||
movd [eax], xmm1
|
||||
mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea eax, [eax+4]
|
||||
mov [ebx], eax
|
||||
movd edx, xmm0
|
||||
lea ebx, [edi+edx*4]
|
||||
mov eax, [ebx]
|
||||
movd [eax], xmm1
|
||||
mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea eax, [eax+4]
|
||||
mov [ebx], eax
|
||||
|
||||
paddw xmm2, xmm7
|
||||
lea esi, [esi+8]
|
||||
sub ecx, 4
|
||||
jnz near HASH_WIDTH_LOOP_SSE2
|
||||
paddw xmm3, xmm6
|
||||
dec dword [i_height]
|
||||
jnz near HASH_HEIGHT_LOOP_SSE2
|
||||
paddw xmm2, xmm7
|
||||
lea esi, [esi+8]
|
||||
sub ecx, 4
|
||||
jnz near HASH_WIDTH_LOOP_SSE2
|
||||
paddw xmm3, xmm6
|
||||
dec dword [i_height]
|
||||
jnz near HASH_HEIGHT_LOOP_SSE2
|
||||
|
||||
add esp, _ls
|
||||
%undef _ps
|
||||
%undef _ls
|
||||
%undef sum_ref
|
||||
%undef pos_list
|
||||
%undef width
|
||||
%undef height
|
||||
%undef i_height
|
||||
pop ebp
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
add esp, _ls
|
||||
%undef _ps
|
||||
%undef _ls
|
||||
%undef sum_ref
|
||||
%undef pos_list
|
||||
%undef width
|
||||
%undef height
|
||||
%undef i_height
|
||||
pop ebp
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
|
||||
;---------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
|
||||
; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
|
||||
;---------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
WELS_EXTERN InitializeHashforFeature_sse2
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
%define _ps 16 ; push size
|
||||
mov edi, [esp+_ps+16] ; pPositionOfSum
|
||||
mov ebp, [esp+_ps+20] ; sum_idx_list
|
||||
mov esi, [esp+_ps+4] ; pTimesOfSum
|
||||
mov ebx, [esp+_ps+8] ; pBuf
|
||||
mov edx, [esp+_ps+12] ; list_sz
|
||||
sar edx, 2
|
||||
mov ecx, 0
|
||||
pxor xmm7, xmm7
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
%define _ps 16 ; push size
|
||||
mov edi, [esp+_ps+16] ; pPositionOfSum
|
||||
mov ebp, [esp+_ps+20] ; sum_idx_list
|
||||
mov esi, [esp+_ps+4] ; pTimesOfSum
|
||||
mov ebx, [esp+_ps+8] ; pBuf
|
||||
mov edx, [esp+_ps+12] ; list_sz
|
||||
sar edx, 2
|
||||
mov ecx, 0
|
||||
pxor xmm7, xmm7
|
||||
hash_assign_loop_x4_sse2:
|
||||
movdqa xmm0, [esi+ecx]
|
||||
pslld xmm0, 2
|
||||
movdqa xmm0, [esi+ecx]
|
||||
pslld xmm0, 2
|
||||
|
||||
movdqa xmm1, xmm0
|
||||
pcmpeqd xmm1, xmm7
|
||||
movmskps eax, xmm1
|
||||
movdqa xmm1, xmm0
|
||||
pcmpeqd xmm1, xmm7
|
||||
movmskps eax, xmm1
|
||||
cmp eax, 0x0f
|
||||
je near hash_assign_with_copy_sse2
|
||||
je near hash_assign_with_copy_sse2
|
||||
|
||||
%assign x 0
|
||||
%rep 4
|
||||
lea eax, [edi+ecx+x]
|
||||
mov [eax], ebx
|
||||
lea eax, [ebp+ecx+x]
|
||||
mov [eax], ebx
|
||||
movd eax, xmm0
|
||||
add ebx, eax
|
||||
psrldq xmm0, 4
|
||||
lea eax, [edi+ecx+x]
|
||||
mov [eax], ebx
|
||||
lea eax, [ebp+ecx+x]
|
||||
mov [eax], ebx
|
||||
movd eax, xmm0
|
||||
add ebx, eax
|
||||
psrldq xmm0, 4
|
||||
%assign x x+4
|
||||
%endrep
|
||||
jmp near assign_next_sse2
|
||||
jmp near assign_next_sse2
|
||||
|
||||
hash_assign_with_copy_sse2:
|
||||
movd xmm1, ebx
|
||||
pshufd xmm2, xmm1, 0
|
||||
movdqa [edi+ecx], xmm2
|
||||
movdqa [ebp+ecx], xmm2
|
||||
movd xmm1, ebx
|
||||
pshufd xmm2, xmm1, 0
|
||||
movdqa [edi+ecx], xmm2
|
||||
movdqa [ebp+ecx], xmm2
|
||||
|
||||
assign_next_sse2:
|
||||
add ecx, 16
|
||||
dec edx
|
||||
jnz near hash_assign_loop_x4_sse2
|
||||
add ecx, 16
|
||||
dec edx
|
||||
jnz near hash_assign_loop_x4_sse2
|
||||
|
||||
mov edx, [esp+_ps+12] ; list_sz
|
||||
and edx, 3
|
||||
jz near hash_assign_no_rem_sse2
|
||||
mov edx, [esp+_ps+12] ; list_sz
|
||||
and edx, 3
|
||||
jz near hash_assign_no_rem_sse2
|
||||
hash_assign_loop_x4_rem_sse2:
|
||||
lea eax, [edi+ecx]
|
||||
mov [eax], ebx
|
||||
lea eax, [ebp+ecx]
|
||||
mov [eax], ebx
|
||||
mov eax, [esi+ecx]
|
||||
sal eax, 2
|
||||
add ebx, eax
|
||||
add ecx, 4
|
||||
dec edx
|
||||
jnz near hash_assign_loop_x4_rem_sse2
|
||||
lea eax, [edi+ecx]
|
||||
mov [eax], ebx
|
||||
lea eax, [ebp+ecx]
|
||||
mov [eax], ebx
|
||||
mov eax, [esi+ecx]
|
||||
sal eax, 2
|
||||
add ebx, eax
|
||||
add ecx, 4
|
||||
dec edx
|
||||
jnz near hash_assign_loop_x4_rem_sse2
|
||||
|
||||
hash_assign_no_rem_sse2:
|
||||
%undef _ps
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
%undef _ps
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
%else
|
||||
|
||||
;**********************************************************************************************************************
|
||||
@ -1398,50 +1398,50 @@ WELS_EXTERN FillQpelLocationByFeatureValue_sse2
|
||||
push r13
|
||||
mov r12, r2
|
||||
|
||||
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
|
||||
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
|
||||
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
|
||||
pxor xmm4, xmm4
|
||||
pxor xmm3, xmm3 ; y_qpel vector
|
||||
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
|
||||
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
|
||||
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
|
||||
pxor xmm4, xmm4
|
||||
pxor xmm3, xmm3 ; y_qpel vector
|
||||
HASH_HEIGHT_LOOP_SSE2:
|
||||
movdqa xmm2, xmm5 ; x_qpel vector
|
||||
mov r4, r1
|
||||
movdqa xmm2, xmm5 ; x_qpel vector
|
||||
mov r4, r1
|
||||
HASH_WIDTH_LOOP_SSE2:
|
||||
movq xmm0, [r0] ; load x8 sum
|
||||
punpcklwd xmm0, xmm4
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
movq xmm0, [r0] ; load x8 sum
|
||||
punpcklwd xmm0, xmm4
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
%rep 3
|
||||
movd r2d, xmm0 ;edx:r3
|
||||
lea r5, [r3+r2*8] ;ebx:r5
|
||||
mov r6, [r5] ;eax:r6
|
||||
movd [r6], xmm1
|
||||
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea r6, [r6+4]
|
||||
mov [r5], r6
|
||||
psrldq xmm1, 4
|
||||
psrldq xmm0, 4
|
||||
movd r2d, xmm0 ;edx:r3
|
||||
lea r5, [r3+r2*8] ;ebx:r5
|
||||
mov r6, [r5] ;eax:r6
|
||||
movd [r6], xmm1
|
||||
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea r6, [r6+4]
|
||||
mov [r5], r6
|
||||
psrldq xmm1, 4
|
||||
psrldq xmm0, 4
|
||||
%endrep
|
||||
movd r2d, xmm0
|
||||
lea r5, [r3+r2*8] ;ebx:r5
|
||||
mov r6, [r5] ;eax:r6
|
||||
movd [r6], xmm1
|
||||
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea r6, [r6+4]
|
||||
mov [r5], r6
|
||||
movd r2d, xmm0
|
||||
lea r5, [r3+r2*8] ;ebx:r5
|
||||
mov r6, [r5] ;eax:r6
|
||||
movd [r6], xmm1
|
||||
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
|
||||
lea r6, [r6+4]
|
||||
mov [r5], r6
|
||||
|
||||
paddw xmm2, xmm7
|
||||
lea r0, [r0+8]
|
||||
sub r4, 4
|
||||
jnz near HASH_WIDTH_LOOP_SSE2
|
||||
paddw xmm3, xmm6
|
||||
dec r12
|
||||
jnz near HASH_HEIGHT_LOOP_SSE2
|
||||
paddw xmm2, xmm7
|
||||
lea r0, [r0+8]
|
||||
sub r4, 4
|
||||
jnz near HASH_WIDTH_LOOP_SSE2
|
||||
paddw xmm3, xmm6
|
||||
dec r12
|
||||
jnz near HASH_HEIGHT_LOOP_SSE2
|
||||
|
||||
pop r13
|
||||
pop r12
|
||||
pop r13
|
||||
pop r12
|
||||
POP_XMM
|
||||
ret
|
||||
ret
|
||||
|
||||
;---------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
|
||||
@ -1455,59 +1455,59 @@ WELS_EXTERN InitializeHashforFeature_sse2
|
||||
push r12
|
||||
push r13
|
||||
mov r12, r2
|
||||
sar r2, 2
|
||||
mov r5, 0 ;r5:ecx
|
||||
sar r2, 2
|
||||
mov r5, 0 ;r5:ecx
|
||||
xor r6, r6
|
||||
pxor xmm3, xmm3
|
||||
pxor xmm3, xmm3
|
||||
hash_assign_loop_x4_sse2:
|
||||
movdqa xmm0, [r0+r5]
|
||||
pslld xmm0, 2
|
||||
movdqa xmm0, [r0+r5]
|
||||
pslld xmm0, 2
|
||||
|
||||
movdqa xmm1, xmm0
|
||||
pcmpeqd xmm1, xmm3
|
||||
movmskps r6, xmm1
|
||||
cmp r6, 0x0f
|
||||
jz near hash_assign_with_copy_sse2
|
||||
movdqa xmm1, xmm0
|
||||
pcmpeqd xmm1, xmm3
|
||||
movmskps r6, xmm1
|
||||
cmp r6, 0x0f
|
||||
jz near hash_assign_with_copy_sse2
|
||||
|
||||
%assign x 0
|
||||
%rep 4
|
||||
lea r13, [r3+r5*2+x]
|
||||
mov [r13], r1
|
||||
lea r13, [r4+r5*2+x]
|
||||
mov [r13], r1
|
||||
movd r6d, xmm0
|
||||
add r1, r6
|
||||
psrldq xmm0, 4
|
||||
lea r13, [r3+r5*2+x]
|
||||
mov [r13], r1
|
||||
lea r13, [r4+r5*2+x]
|
||||
mov [r13], r1
|
||||
movd r6d, xmm0
|
||||
add r1, r6
|
||||
psrldq xmm0, 4
|
||||
%assign x x+8
|
||||
%endrep
|
||||
jmp near assign_next_sse2
|
||||
jmp near assign_next_sse2
|
||||
|
||||
hash_assign_with_copy_sse2:
|
||||
movq xmm1, r1
|
||||
pshufd xmm2, xmm1, 01000100b
|
||||
movdqa [r3+r5*2], xmm2
|
||||
movdqa [r4+r5*2], xmm2
|
||||
movdqa [r3+r5*2+16], xmm2
|
||||
movdqa [r4+r5*2+16], xmm2
|
||||
movq xmm1, r1
|
||||
pshufd xmm2, xmm1, 01000100b
|
||||
movdqa [r3+r5*2], xmm2
|
||||
movdqa [r4+r5*2], xmm2
|
||||
movdqa [r3+r5*2+16], xmm2
|
||||
movdqa [r4+r5*2+16], xmm2
|
||||
|
||||
assign_next_sse2:
|
||||
add r5, 16
|
||||
dec r2
|
||||
jnz near hash_assign_loop_x4_sse2
|
||||
add r5, 16
|
||||
dec r2
|
||||
jnz near hash_assign_loop_x4_sse2
|
||||
|
||||
and r12, 3
|
||||
jz near hash_assign_no_rem_sse2
|
||||
and r12, 3
|
||||
jz near hash_assign_no_rem_sse2
|
||||
hash_assign_loop_x4_rem_sse2:
|
||||
lea r13, [r3+r5*2]
|
||||
mov [r13], r1
|
||||
lea r13, [r4+r5*2]
|
||||
mov [r13], r1
|
||||
mov r6d, [r0+r5]
|
||||
sal r6, 2
|
||||
add r1, r6
|
||||
add r5, 4
|
||||
dec r12
|
||||
jnz near hash_assign_loop_x4_rem_sse2
|
||||
lea r13, [r3+r5*2]
|
||||
mov [r13], r1
|
||||
lea r13, [r4+r5*2]
|
||||
mov [r13], r1
|
||||
mov r6d, [r0+r5]
|
||||
sal r6, 2
|
||||
add r1, r6
|
||||
add r5, 4
|
||||
dec r12
|
||||
jnz near hash_assign_loop_x4_rem_sse2
|
||||
|
||||
hash_assign_no_rem_sse2:
|
||||
pop r13
|
||||
|
@ -242,9 +242,13 @@ delete[] pFeaturePointValueList1Buff; \
|
||||
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 10, 10)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 16, 16)
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 640, 320)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 640, 320)
|
||||
#ifdef X86_ASM
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 10, 10)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 16, 16)
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 640, 320)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 640, 320)
|
||||
#endif
|
||||
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)
|
||||
|
Loading…
Reference in New Issue
Block a user