refine format and add UT cases

This commit is contained in:
zhiliang wang 2014-08-15 09:22:37 +08:00
parent 76863f977a
commit ef88889404
2 changed files with 197 additions and 193 deletions

View File

@ -676,154 +676,154 @@ WIDTH_LOOP_X16_SSE4:
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;
; 32-bit x86 version (arguments are read from the stack).
; For every feature value read from pFeatureOfBlock, in row-major order, one
; 32-bit (x_qpel, y_qpel) word pair is written through the pointer stored in
; pFeatureValuePointerList[feature value], and that stored pointer is then
; advanced by 4 bytes.
;
; Register roles:
;   esi  = pFeatureOfBlock read pointer (named sum_ref below)
;   edi  = pFeatureValuePointerList     (named pos_list below)
;   ebp  = kiWidth, ecx = columns left in the current row
;   [i_height] = rows left (stack local)
;   xmm7 = per-step x increment, xmm6 = per-row y increment,
;   xmm5 = x vector at the start of a row, xmm2 = current x vector,
;   xmm3 = current y vector, xmm4 = zero
;
; NOTE(review): both loops are bottom-tested (sub/dec followed by jnz), so
; kiWidth must be a positive multiple of 4 and kiHeight must be > 0 -- confirm
; that callers guarantee this.
; NOTE(review): in this view every instruction group is listed twice in a row
; (apparently the before/after copies of a formatting-only change); the
; comments below annotate each group once.
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
; save the registers this routine overwrites
push esi
push edi
push ebx
push ebp
push esi
push edi
push ebx
push ebp
; stack layout after the pushes and the local: [esp] = local, then 16 bytes of
; saved registers, then the return address, then the four arguments
%define _ps 16 ; push size
%define _ls 4 ; local size
%define sum_ref esp+_ps+_ls+4
%define pos_list esp+_ps+_ls+16
%define width esp+_ps+_ls+8
%define height esp+_ps+_ls+12
%define i_height esp
sub esp, _ls
%define _ps 16 ; push size
%define _ls 4 ; local size
%define sum_ref esp+_ps+_ls+4
%define pos_list esp+_ps+_ls+16
%define width esp+_ps+_ls+8
%define height esp+_ps+_ls+12
%define i_height esp
sub esp, _ls
; load the arguments; the row counter is kept in the stack local
mov esi, [sum_ref]
mov edi, [pos_list]
mov ebp, [width]
mov ebx, [height]
mov [i_height], ebx
mov esi, [sum_ref]
mov edi, [pos_list]
mov ebp, [width]
mov ebx, [height]
mov [i_height], ebx
; constants come from tables defined elsewhere in the file (values not visible here)
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
pxor xmm4, xmm4 ; zero, used to widen 16-bit values to 32 bits
pxor xmm3, xmm3 ; y_qpel vector
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
pxor xmm4, xmm4 ; zero, used to widen 16-bit values to 32 bits
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
; start of a row: reset the x vector and the column counter
movdqa xmm2, xmm5 ; x_qpel vector
mov ecx, ebp
movdqa xmm2, xmm5 ; x_qpel vector
mov ecx, ebp
HASH_WIDTH_LOOP_SSE2:
; xmm0 = four feature values zero-extended to dwords
; xmm1 = four dwords, each with x_qpel in the low word and y_qpel in the high word
movq xmm0, [esi] ; load x8 sum (8 bytes = four 16-bit values)
punpcklwd xmm0, xmm4
movdqa xmm1, xmm2
punpcklwd xmm1, xmm3
movq xmm0, [esi] ; load x8 sum (8 bytes = four 16-bit values)
punpcklwd xmm0, xmm4
movdqa xmm1, xmm2
punpcklwd xmm1, xmm3
; elements 0..2: store the pair at the current write pointer for this feature
; value, bump the pointer by 4 bytes, then shift the next element into place
%rep 3
movd edx, xmm0 ; edx = feature value
lea ebx, [edi+edx*4] ; ebx = address of this value's list entry
mov eax, [ebx] ; eax = current write pointer
movd [eax], xmm1 ; store the (x, y) pair
mov edx, [eax+4] ; explicitly load eax+4 due to cache miss from VTune observation (loaded value is not used)
lea eax, [eax+4]
mov [ebx], eax ; write the advanced pointer back
psrldq xmm1, 4
psrldq xmm0, 4
movd edx, xmm0 ; edx = feature value
lea ebx, [edi+edx*4] ; ebx = address of this value's list entry
mov eax, [ebx] ; eax = current write pointer
movd [eax], xmm1 ; store the (x, y) pair
mov edx, [eax+4] ; explicitly load eax+4 due to cache miss from VTune observation (loaded value is not used)
lea eax, [eax+4]
mov [ebx], eax ; write the advanced pointer back
psrldq xmm1, 4
psrldq xmm0, 4
%endrep
; element 3: same steps, without the trailing shifts
movd edx, xmm0
lea ebx, [edi+edx*4]
mov eax, [ebx]
movd [eax], xmm1
mov edx, [eax+4] ; explicitly load eax+4 due to cache miss from VTune observation (loaded value is not used)
lea eax, [eax+4]
mov [ebx], eax
movd edx, xmm0
lea ebx, [edi+edx*4]
mov eax, [ebx]
movd [eax], xmm1
mov edx, [eax+4] ; explicitly load eax+4 due to cache miss from VTune observation (loaded value is not used)
lea eax, [eax+4]
mov [ebx], eax
; next four columns: advance x, the source pointer (4 x 2 bytes) and the counter;
; after the last column advance y and move on to the next row
paddw xmm2, xmm7
lea esi, [esi+8]
sub ecx, 4
jnz near HASH_WIDTH_LOOP_SSE2
paddw xmm3, xmm6
dec dword [i_height]
jnz near HASH_HEIGHT_LOOP_SSE2
paddw xmm2, xmm7
lea esi, [esi+8]
sub ecx, 4
jnz near HASH_WIDTH_LOOP_SSE2
paddw xmm3, xmm6
dec dword [i_height]
jnz near HASH_HEIGHT_LOOP_SSE2
; release the local, drop the helper macros and restore the saved registers
add esp, _ls
%undef _ps
%undef _ls
%undef sum_ref
%undef pos_list
%undef width
%undef height
%undef i_height
pop ebp
pop ebx
pop edi
pop esi
ret
add esp, _ls
%undef _ps
%undef _ls
%undef sum_ref
%undef pos_list
%undef width
%undef height
%undef i_height
pop ebp
pop ebx
pop edi
pop esi
ret
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;
; 32-bit x86 version (arguments are read from the stack).
; For each index i in [0, kiListSize): stores the running buffer pointer into
; both pLocationOfFeature[i] and pFeatureValuePointerList[i], then advances
; that pointer by pTimesOfFeatureValue[i] * 4 bytes. The running pointer
; starts at pBuf.
;
; Register roles:
;   esi = pTimesOfFeatureValue, ebx = running buffer pointer,
;   edi = pLocationOfFeature,   ebp = pFeatureValuePointerList,
;   ecx = byte offset into all three arrays, edx = loop counter, xmm7 = zero
;
; The x4 loop uses movdqa, so pTimesOfFeatureValue (always) and the two
; pointer lists (on the all-zero path) must be 16-byte aligned.
; NOTE(review): the x4 loop is bottom-tested; with kiListSize < 4 it is entered
; with edx == 0 and dec/jnz does not stop as intended -- confirm that callers
; always pass kiListSize >= 4.
; NOTE(review): in this view every instruction group is listed twice in a row
; (apparently the before/after copies of a formatting-only change); the
; comments below annotate each group once.
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
; save the registers this routine overwrites, then load the arguments
push ebx
push esi
push edi
push ebp
%define _ps 16 ; push size
mov edi, [esp+_ps+16] ; pPositionOfSum (4th argument: pLocationOfFeature)
mov ebp, [esp+_ps+20] ; sum_idx_list (5th argument: pFeatureValuePointerList)
mov esi, [esp+_ps+4] ; pTimesOfSum (1st argument: pTimesOfFeatureValue)
mov ebx, [esp+_ps+8] ; pBuf
mov edx, [esp+_ps+12] ; list_sz
sar edx, 2 ; number of 4-entry groups
mov ecx, 0
pxor xmm7, xmm7
push ebx
push esi
push edi
push ebp
%define _ps 16 ; push size
mov edi, [esp+_ps+16] ; pPositionOfSum (4th argument: pLocationOfFeature)
mov ebp, [esp+_ps+20] ; sum_idx_list (5th argument: pFeatureValuePointerList)
mov esi, [esp+_ps+4] ; pTimesOfSum (1st argument: pTimesOfFeatureValue)
mov ebx, [esp+_ps+8] ; pBuf
mov edx, [esp+_ps+12] ; list_sz
sar edx, 2 ; number of 4-entry groups
mov ecx, 0
pxor xmm7, xmm7
hash_assign_loop_x4_sse2:
; xmm0 = four counts, each multiplied by 4 (byte size of that entry's region)
movdqa xmm0, [esi+ecx]
pslld xmm0, 2
movdqa xmm0, [esi+ecx]
pslld xmm0, 2
; eax = bit mask of the counts that are zero; 0x0f means all four are zero
movdqa xmm1, xmm0
pcmpeqd xmm1, xmm7
movmskps eax, xmm1
movdqa xmm1, xmm0
pcmpeqd xmm1, xmm7
movmskps eax, xmm1
cmp eax, 0x0f
je near hash_assign_with_copy_sse2
je near hash_assign_with_copy_sse2
; general path: handle the four entries one by one
%assign x 0
%rep 4
lea eax, [edi+ecx+x]
mov [eax], ebx ; pLocationOfFeature entry = running pointer
lea eax, [ebp+ecx+x]
mov [eax], ebx ; pFeatureValuePointerList entry = running pointer
movd eax, xmm0
add ebx, eax ; skip this entry's region
psrldq xmm0, 4 ; bring the next size into the low dword
lea eax, [edi+ecx+x]
mov [eax], ebx ; pLocationOfFeature entry = running pointer
lea eax, [ebp+ecx+x]
mov [eax], ebx ; pFeatureValuePointerList entry = running pointer
movd eax, xmm0
add ebx, eax ; skip this entry's region
psrldq xmm0, 4 ; bring the next size into the low dword
%assign x x+4
%endrep
jmp near assign_next_sse2
jmp near assign_next_sse2
hash_assign_with_copy_sse2:
; all four counts are zero: the running pointer does not move, so broadcast it
; and store the same value into four consecutive entries of both lists
movd xmm1, ebx
pshufd xmm2, xmm1, 0
movdqa [edi+ecx], xmm2
movdqa [ebp+ecx], xmm2
movd xmm1, ebx
pshufd xmm2, xmm1, 0
movdqa [edi+ecx], xmm2
movdqa [ebp+ecx], xmm2
assign_next_sse2:
; advance to the next group of four entries (16 bytes)
add ecx, 16
dec edx
jnz near hash_assign_loop_x4_sse2
add ecx, 16
dec edx
jnz near hash_assign_loop_x4_sse2
; remaining kiListSize & 3 entries are handled one at a time
mov edx, [esp+_ps+12] ; list_sz
and edx, 3
jz near hash_assign_no_rem_sse2
mov edx, [esp+_ps+12] ; list_sz
and edx, 3
jz near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
lea eax, [edi+ecx]
mov [eax], ebx
lea eax, [ebp+ecx]
mov [eax], ebx
mov eax, [esi+ecx] ; eax = count for this entry
sal eax, 2 ; count * 4 bytes
add ebx, eax
add ecx, 4
dec edx
jnz near hash_assign_loop_x4_rem_sse2
lea eax, [edi+ecx]
mov [eax], ebx
lea eax, [ebp+ecx]
mov [eax], ebx
mov eax, [esi+ecx] ; eax = count for this entry
sal eax, 2 ; count * 4 bytes
add ebx, eax
add ecx, 4
dec edx
jnz near hash_assign_loop_x4_rem_sse2
hash_assign_no_rem_sse2:
; restore the saved registers and return
%undef _ps
pop ebp
pop edi
pop esi
pop ebx
ret
%undef _ps
pop ebp
pop edi
pop esi
pop ebx
ret
%else
;**********************************************************************************************************************
@ -1398,50 +1398,50 @@ WELS_EXTERN FillQpelLocationByFeatureValue_sse2
push r13
mov r12, r2
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
movdqa xmm2, xmm5 ; x_qpel vector
mov r4, r1
movdqa xmm2, xmm5 ; x_qpel vector
mov r4, r1
HASH_WIDTH_LOOP_SSE2:
movq xmm0, [r0] ; load x8 sum
punpcklwd xmm0, xmm4
movdqa xmm1, xmm2
punpcklwd xmm1, xmm3
movq xmm0, [r0] ; load x8 sum
punpcklwd xmm0, xmm4
movdqa xmm1, xmm2
punpcklwd xmm1, xmm3
%rep 3
movd r2d, xmm0 ;edx:r3
lea r5, [r3+r2*8] ;ebx:r5
mov r6, [r5] ;eax:r6
movd [r6], xmm1
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
lea r6, [r6+4]
mov [r5], r6
psrldq xmm1, 4
psrldq xmm0, 4
movd r2d, xmm0 ;edx:r3
lea r5, [r3+r2*8] ;ebx:r5
mov r6, [r5] ;eax:r6
movd [r6], xmm1
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
lea r6, [r6+4]
mov [r5], r6
psrldq xmm1, 4
psrldq xmm0, 4
%endrep
movd r2d, xmm0
lea r5, [r3+r2*8] ;ebx:r5
mov r6, [r5] ;eax:r6
movd [r6], xmm1
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
lea r6, [r6+4]
mov [r5], r6
movd r2d, xmm0
lea r5, [r3+r2*8] ;ebx:r5
mov r6, [r5] ;eax:r6
movd [r6], xmm1
mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
lea r6, [r6+4]
mov [r5], r6
paddw xmm2, xmm7
lea r0, [r0+8]
sub r4, 4
jnz near HASH_WIDTH_LOOP_SSE2
paddw xmm3, xmm6
dec r12
jnz near HASH_HEIGHT_LOOP_SSE2
paddw xmm2, xmm7
lea r0, [r0+8]
sub r4, 4
jnz near HASH_WIDTH_LOOP_SSE2
paddw xmm3, xmm6
dec r12
jnz near HASH_HEIGHT_LOOP_SSE2
pop r13
pop r12
pop r13
pop r12
POP_XMM
ret
ret
;---------------------------------------------------------------------------------------------------------------------------------------------------
; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
@ -1455,59 +1455,59 @@ WELS_EXTERN InitializeHashforFeature_sse2
push r12
push r13
mov r12, r2
sar r2, 2
mov r5, 0 ;r5:ecx
sar r2, 2
mov r5, 0 ;r5:ecx
xor r6, r6
pxor xmm3, xmm3
pxor xmm3, xmm3
hash_assign_loop_x4_sse2:
movdqa xmm0, [r0+r5]
pslld xmm0, 2
movdqa xmm0, [r0+r5]
pslld xmm0, 2
movdqa xmm1, xmm0
pcmpeqd xmm1, xmm3
movmskps r6, xmm1
cmp r6, 0x0f
jz near hash_assign_with_copy_sse2
movdqa xmm1, xmm0
pcmpeqd xmm1, xmm3
movmskps r6, xmm1
cmp r6, 0x0f
jz near hash_assign_with_copy_sse2
%assign x 0
%rep 4
lea r13, [r3+r5*2+x]
mov [r13], r1
lea r13, [r4+r5*2+x]
mov [r13], r1
movd r6d, xmm0
add r1, r6
psrldq xmm0, 4
lea r13, [r3+r5*2+x]
mov [r13], r1
lea r13, [r4+r5*2+x]
mov [r13], r1
movd r6d, xmm0
add r1, r6
psrldq xmm0, 4
%assign x x+8
%endrep
jmp near assign_next_sse2
jmp near assign_next_sse2
hash_assign_with_copy_sse2:
movq xmm1, r1
pshufd xmm2, xmm1, 01000100b
movdqa [r3+r5*2], xmm2
movdqa [r4+r5*2], xmm2
movdqa [r3+r5*2+16], xmm2
movdqa [r4+r5*2+16], xmm2
movq xmm1, r1
pshufd xmm2, xmm1, 01000100b
movdqa [r3+r5*2], xmm2
movdqa [r4+r5*2], xmm2
movdqa [r3+r5*2+16], xmm2
movdqa [r4+r5*2+16], xmm2
assign_next_sse2:
add r5, 16
dec r2
jnz near hash_assign_loop_x4_sse2
add r5, 16
dec r2
jnz near hash_assign_loop_x4_sse2
and r12, 3
jz near hash_assign_no_rem_sse2
and r12, 3
jz near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
lea r13, [r3+r5*2]
mov [r13], r1
lea r13, [r4+r5*2]
mov [r13], r1
mov r6d, [r0+r5]
sal r6, 2
add r1, r6
add r5, 4
dec r12
jnz near hash_assign_loop_x4_rem_sse2
lea r13, [r3+r5*2]
mov [r13], r1
lea r13, [r4+r5*2]
mov [r13], r1
mov r6d, [r0+r5]
sal r6, 2
add r1, r6
add r5, 4
dec r12
jnz near hash_assign_loop_x4_rem_sse2
hash_assign_no_rem_sse2:
pop r13

View File

@ -242,9 +242,13 @@ delete[] pFeaturePointValueList1Buff; \
// Test instantiations for the plain C implementations, each paired with its *_ref counterpart.
// NOTE(review): presumably each macro generates a test comparing the two functions, and the
// two numeric arguments are frame width and height -- confirm against the GENERATE_* macro definitions.
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 10, 10)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 16, 16)
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 640, 320)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 640, 320)
#ifdef X86_ASM
// Same four instantiations for the SSE2 assembly implementations; compiled only when X86_ASM is defined.
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 10, 10)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 16, 16)
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 640, 320)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 640, 320)
#endif
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)