From ef888894046228861825f4acdab09bf6298a6037 Mon Sep 17 00:00:00 2001 From: zhiliang wang Date: Fri, 15 Aug 2014 09:22:37 +0800 Subject: [PATCH] refine format and add UT cases --- codec/encoder/core/x86/sample_sc.asm | 386 +++++++++++++-------------- test/encoder/EncUT_SVC_me.cpp | 4 + 2 files changed, 197 insertions(+), 193 deletions(-) diff --git a/codec/encoder/core/x86/sample_sc.asm b/codec/encoder/core/x86/sample_sc.asm index e0c1df19..4ac2ac48 100644 --- a/codec/encoder/core/x86/sample_sc.asm +++ b/codec/encoder/core/x86/sample_sc.asm @@ -676,154 +676,154 @@ WIDTH_LOOP_X16_SSE4: ; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList) ;----------------------------------------------------------------------------------------------------------------------------- WELS_EXTERN FillQpelLocationByFeatureValue_sse2 - push esi - push edi - push ebx - push ebp + push esi + push edi + push ebx + push ebp - %define _ps 16 ; push size - %define _ls 4 ; local size - %define sum_ref esp+_ps+_ls+4 - %define pos_list esp+_ps+_ls+16 - %define width esp+_ps+_ls+8 - %define height esp+_ps+_ls+12 - %define i_height esp - sub esp, _ls + %define _ps 16 ; push size + %define _ls 4 ; local size + %define sum_ref esp+_ps+_ls+4 ; arg 1: pFeatureOfBlock + %define pos_list esp+_ps+_ls+16 ; arg 4: pFeatureValuePointerList + %define width esp+_ps+_ls+8 ; arg 2: kiWidth + %define height esp+_ps+_ls+12 ; arg 3: kiHeight + %define i_height esp ; local slot: rows remaining + sub esp, _ls - mov esi, [sum_ref] - mov edi, [pos_list] - mov ebp, [width] - mov ebx, [height] - mov [i_height], ebx + mov esi, [sum_ref] + mov edi, [pos_list] + mov ebp, [width] + mov ebx, [height] + mov [i_height], ebx - movq xmm7, [mv_x_inc_x4] ; x_qpel inc - movq xmm6, [mv_y_inc_x4] ; y_qpel inc - movq xmm5, [mx_x_offset_x4] ; x_qpel vector - pxor xmm4, xmm4 - pxor xmm3, xmm3 ; y_qpel vector + movq xmm7, [mv_x_inc_x4] ; x_qpel inc + movq xmm6, [mv_y_inc_x4] ; y_qpel inc + movq xmm5, [mx_x_offset_x4] ; x_qpel vector + pxor xmm4, xmm4 + pxor 
xmm3, xmm3 ; y_qpel vector HASH_HEIGHT_LOOP_SSE2: - movdqa xmm2, xmm5 ; x_qpel vector - mov ecx, ebp + movdqa xmm2, xmm5 ; x_qpel vector + mov ecx, ebp HASH_WIDTH_LOOP_SSE2: - movq xmm0, [esi] ; load x8 sum - punpcklwd xmm0, xmm4 - movdqa xmm1, xmm2 - punpcklwd xmm1, xmm3 + movq xmm0, [esi] ; load 4 x 16-bit sums (8 bytes) + punpcklwd xmm0, xmm4 ; zero-extend the sums to 4 dwords + movdqa xmm1, xmm2 + punpcklwd xmm1, xmm3 ; xmm1 dwords = x_qpel | (y_qpel << 16) %rep 3 - movd edx, xmm0 - lea ebx, [edi+edx*4] - mov eax, [ebx] - movd [eax], xmm1 - mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation - lea eax, [eax+4] - mov [ebx], eax - psrldq xmm1, 4 - psrldq xmm0, 4 + movd edx, xmm0 ; edx = current sum (feature value) + lea ebx, [edi+edx*4] ; ebx = &pos_list[edx] + mov eax, [ebx] ; eax = write pointer stored for this value + movd [eax], xmm1 ; store the packed x/y pair + mov edx, [eax+4] ; explicitly load eax+4 (result unused) due to cache misses observed in VTune + lea eax, [eax+4] + mov [ebx], eax ; save the advanced write pointer + psrldq xmm1, 4 + psrldq xmm0, 4 %endrep - movd edx, xmm0 - lea ebx, [edi+edx*4] - mov eax, [ebx] - movd [eax], xmm1 - mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation - lea eax, [eax+4] - mov [ebx], eax + movd edx, xmm0 + lea ebx, [edi+edx*4] + mov eax, [ebx] + movd [eax], xmm1 + mov edx, [eax+4] ; explicitly load eax+4 (result unused) due to cache misses observed in VTune + lea eax, [eax+4] + mov [ebx], eax - paddw xmm2, xmm7 - lea esi, [esi+8] - sub ecx, 4 - jnz near HASH_WIDTH_LOOP_SSE2 - paddw xmm3, xmm6 - dec dword [i_height] - jnz near HASH_HEIGHT_LOOP_SSE2 + paddw xmm2, xmm7 + lea esi, [esi+8] + sub ecx, 4 ; 4 sums per pass; NOTE(review): assumes width is a positive multiple of 4 + jnz near HASH_WIDTH_LOOP_SSE2 + paddw xmm3, xmm6 + dec dword [i_height] + jnz near HASH_HEIGHT_LOOP_SSE2 - add esp, _ls - %undef _ps - %undef _ls - %undef sum_ref - %undef pos_list - %undef width - %undef height - %undef i_height - pop ebp - pop ebx - pop edi - pop esi - ret + add esp, _ls + %undef _ps + %undef _ls + %undef sum_ref + %undef pos_list + %undef width + %undef height + %undef i_height + pop ebp + pop ebx + pop edi + pop esi + ret 
;--------------------------------------------------------------------------------------------------------------------------------------------------- ; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, ; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList ) ;--------------------------------------------------------------------------------------------------------------------------------------------------- WELS_EXTERN InitializeHashforFeature_sse2 - push ebx - push esi - push edi - push ebp - %define _ps 16 ; push size - mov edi, [esp+_ps+16] ; pPositionOfSum - mov ebp, [esp+_ps+20] ; sum_idx_list - mov esi, [esp+_ps+4] ; pTimesOfSum - mov ebx, [esp+_ps+8] ; pBuf - mov edx, [esp+_ps+12] ; list_sz - sar edx, 2 - mov ecx, 0 - pxor xmm7, xmm7 + push ebx + push esi + push edi + push ebp + %define _ps 16 ; push size + mov edi, [esp+_ps+16] ; arg 4: pLocationOfFeature + mov ebp, [esp+_ps+20] ; arg 5: pFeatureValuePointerList + mov esi, [esp+_ps+4] ; arg 1: pTimesOfFeatureValue + mov ebx, [esp+_ps+8] ; arg 2: pBuf + mov edx, [esp+_ps+12] ; arg 3: kiListSize + sar edx, 2 ; edx = kiListSize / 4 passes; NOTE(review): loop body runs before the count is tested, so kiListSize >= 4 is assumed + mov ecx, 0 ; ecx = byte offset into the tables + pxor xmm7, xmm7 hash_assign_loop_x4_sse2: - movdqa xmm0, [esi+ecx] - pslld xmm0, 2 + movdqa xmm0, [esi+ecx] ; 4 x 32-bit counts (movdqa: needs 16-byte alignment) + pslld xmm0, 2 ; counts * 4 = bytes to advance pBuf by - movdqa xmm1, xmm0 - pcmpeqd xmm1, xmm7 - movmskps eax, xmm1 + movdqa xmm1, xmm0 + pcmpeqd xmm1, xmm7 + movmskps eax, xmm1 ; eax = 0x0f when all four counts are zero cmp eax, 0x0f - je near hash_assign_with_copy_sse2 + je near hash_assign_with_copy_sse2 %assign x 0 %rep 4 - lea eax, [edi+ecx+x] - mov [eax], ebx - lea eax, [ebp+ecx+x] - mov [eax], ebx - movd eax, xmm0 - add ebx, eax - psrldq xmm0, 4 + lea eax, [edi+ecx+x] + mov [eax], ebx + lea eax, [ebp+ecx+x] + mov [eax], ebx + movd eax, xmm0 + add ebx, eax + psrldq xmm0, 4 %assign x x+4 %endrep - jmp near assign_next_sse2 + jmp near assign_next_sse2 hash_assign_with_copy_sse2: - movd xmm1, ebx - pshufd xmm2, xmm1, 0 - movdqa [edi+ecx], xmm2 - movdqa [ebp+ecx], xmm2 + movd xmm1, ebx + pshufd xmm2, xmm1, 0 + movdqa [edi+ecx], xmm2 + movdqa [ebp+ecx], 
xmm2 assign_next_sse2: - add ecx, 16 - dec edx - jnz near hash_assign_loop_x4_sse2 + add ecx, 16 + dec edx + jnz near hash_assign_loop_x4_sse2 - mov edx, [esp+_ps+12] ; list_sz - and edx, 3 - jz near hash_assign_no_rem_sse2 + mov edx, [esp+_ps+12] ; arg 3: kiListSize + and edx, 3 ; entries left over after the x4 loop + jz near hash_assign_no_rem_sse2 hash_assign_loop_x4_rem_sse2: - lea eax, [edi+ecx] - mov [eax], ebx - lea eax, [ebp+ecx] - mov [eax], ebx - mov eax, [esi+ecx] - sal eax, 2 - add ebx, eax - add ecx, 4 - dec edx - jnz near hash_assign_loop_x4_rem_sse2 + lea eax, [edi+ecx] + mov [eax], ebx + lea eax, [ebp+ecx] + mov [eax], ebx + mov eax, [esi+ecx] + sal eax, 2 + add ebx, eax + add ecx, 4 + dec edx + jnz near hash_assign_loop_x4_rem_sse2 hash_assign_no_rem_sse2: - %undef _ps - pop ebp - pop edi - pop esi - pop ebx - ret + %undef _ps + pop ebp + pop edi + pop esi + pop ebx + ret %else ;********************************************************************************************************************** @@ -1398,50 +1398,50 @@ WELS_EXTERN FillQpelLocationByFeatureValue_sse2 push r13 mov r12, r2 - movq xmm7, [mv_x_inc_x4] ; x_qpel inc - movq xmm6, [mv_y_inc_x4] ; y_qpel inc - movq xmm5, [mx_x_offset_x4] ; x_qpel vector - pxor xmm4, xmm4 - pxor xmm3, xmm3 ; y_qpel vector + movq xmm7, [mv_x_inc_x4] ; x_qpel inc + movq xmm6, [mv_y_inc_x4] ; y_qpel inc + movq xmm5, [mx_x_offset_x4] ; x_qpel vector + pxor xmm4, xmm4 + pxor xmm3, xmm3 ; y_qpel vector HASH_HEIGHT_LOOP_SSE2: - movdqa xmm2, xmm5 ; x_qpel vector - mov r4, r1 + movdqa xmm2, xmm5 ; x_qpel vector + mov r4, r1 HASH_WIDTH_LOOP_SSE2: - movq xmm0, [r0] ; load x8 sum - punpcklwd xmm0, xmm4 - movdqa xmm1, xmm2 - punpcklwd xmm1, xmm3 + movq xmm0, [r0] ; load 4 x 16-bit sums (8 bytes) + punpcklwd xmm0, xmm4 + movdqa xmm1, xmm2 + punpcklwd xmm1, xmm3 %rep 3 - movd r2d, xmm0 ;edx:r3 - lea r5, [r3+r2*8] ;ebx:r5 - mov r6, [r5] ;eax:r6 - movd [r6], xmm1 - mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation - lea r6, [r6+4] - mov [r5], r6 - psrldq xmm1, 4 - 
psrldq xmm0, 4 + movd r2d, xmm0 ;edx:r2 + lea r5, [r3+r2*8] ;ebx:r5 + mov r6, [r5] ;eax:r6 + movd [r6], xmm1 + mov r13, [r6+4] ; explicitly load r6+4 (result unused) due to cache misses observed in VTune + lea r6, [r6+4] + mov [r5], r6 + psrldq xmm1, 4 + psrldq xmm0, 4 %endrep - movd r2d, xmm0 - lea r5, [r3+r2*8] ;ebx:r5 - mov r6, [r5] ;eax:r6 - movd [r6], xmm1 - mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation - lea r6, [r6+4] - mov [r5], r6 + movd r2d, xmm0 + lea r5, [r3+r2*8] ;ebx:r5 + mov r6, [r5] ;eax:r6 + movd [r6], xmm1 + mov r13, [r6+4] ; explicitly load r6+4 (result unused) due to cache misses observed in VTune + lea r6, [r6+4] + mov [r5], r6 - paddw xmm2, xmm7 - lea r0, [r0+8] - sub r4, 4 - jnz near HASH_WIDTH_LOOP_SSE2 - paddw xmm3, xmm6 - dec r12 - jnz near HASH_HEIGHT_LOOP_SSE2 + paddw xmm2, xmm7 + lea r0, [r0+8] + sub r4, 4 ; 4 sums per pass; NOTE(review): assumes width is a positive multiple of 4 + jnz near HASH_WIDTH_LOOP_SSE2 + paddw xmm3, xmm6 + dec r12 + jnz near HASH_HEIGHT_LOOP_SSE2 - pop r13 - pop r12 + pop r13 + pop r12 POP_XMM - ret + ret ;--------------------------------------------------------------------------------------------------------------------------------------------------- ; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, @@ -1455,59 +1455,59 @@ WELS_EXTERN InitializeHashforFeature_sse2 push r12 push r13 mov r12, r2 - sar r2, 2 - mov r5, 0 ;r5:ecx + sar r2, 2 ; r2 = kiListSize / 4 passes; NOTE(review): loop body runs before the count is tested, so kiListSize >= 4 is assumed + mov r5, 0 ;r5:ecx xor r6, r6 - pxor xmm3, xmm3 + pxor xmm3, xmm3 hash_assign_loop_x4_sse2: - movdqa xmm0, [r0+r5] - pslld xmm0, 2 + movdqa xmm0, [r0+r5] + pslld xmm0, 2 - movdqa xmm1, xmm0 - pcmpeqd xmm1, xmm3 - movmskps r6, xmm1 - cmp r6, 0x0f - jz near hash_assign_with_copy_sse2 + movdqa xmm1, xmm0 + pcmpeqd xmm1, xmm3 + movmskps r6, xmm1 + cmp r6, 0x0f + jz near hash_assign_with_copy_sse2 %assign x 0 %rep 4 - lea r13, [r3+r5*2+x] - mov [r13], r1 - lea r13, [r4+r5*2+x] - mov [r13], r1 - movd r6d, xmm0 - add r1, r6 - psrldq xmm0, 4 + lea r13, [r3+r5*2+x] + mov [r13], r1 + lea r13, 
[r4+r5*2+x] + mov [r13], r1 + movd r6d, xmm0 + add r1, r6 + psrldq xmm0, 4 %assign x x+8 %endrep - jmp near assign_next_sse2 + jmp near assign_next_sse2 hash_assign_with_copy_sse2: - movq xmm1, r1 - pshufd xmm2, xmm1, 01000100b - movdqa [r3+r5*2], xmm2 - movdqa [r4+r5*2], xmm2 - movdqa [r3+r5*2+16], xmm2 - movdqa [r4+r5*2+16], xmm2 + movq xmm1, r1 + pshufd xmm2, xmm1, 01000100b + movdqa [r3+r5*2], xmm2 + movdqa [r4+r5*2], xmm2 + movdqa [r3+r5*2+16], xmm2 + movdqa [r4+r5*2+16], xmm2 assign_next_sse2: - add r5, 16 - dec r2 - jnz near hash_assign_loop_x4_sse2 + add r5, 16 + dec r2 + jnz near hash_assign_loop_x4_sse2 - and r12, 3 - jz near hash_assign_no_rem_sse2 + and r12, 3 + jz near hash_assign_no_rem_sse2 hash_assign_loop_x4_rem_sse2: - lea r13, [r3+r5*2] - mov [r13], r1 - lea r13, [r4+r5*2] - mov [r13], r1 - mov r6d, [r0+r5] - sal r6, 2 - add r1, r6 - add r5, 4 - dec r12 - jnz near hash_assign_loop_x4_rem_sse2 + lea r13, [r3+r5*2] + mov [r13], r1 + lea r13, [r4+r5*2] + mov [r13], r1 + mov r6d, [r0+r5] + sal r6, 2 + add r1, r6 + add r5, 4 + dec r12 + jnz near hash_assign_loop_x4_rem_sse2 hash_assign_no_rem_sse2: pop r13 diff --git a/test/encoder/EncUT_SVC_me.cpp b/test/encoder/EncUT_SVC_me.cpp index 85b8c045..bf4c7edb 100644 --- a/test/encoder/EncUT_SVC_me.cpp +++ b/test/encoder/EncUT_SVC_me.cpp @@ -242,9 +242,13 @@ delete[] pFeaturePointValueList1Buff; \ GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 10, 10) GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 16, 16) +GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 640, 320) +GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 640, 320) #ifdef X86_ASM GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 10, 10) GENERATE_FillQpelLocationByFeatureValue 
(FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 16, 16) +GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 640, 320) +GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 640, 320) #endif GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)