commit
e4a9c7f8e4
@ -100,6 +100,7 @@ BITS 64
|
||||
%define r1w dx
|
||||
%define r2w r8w
|
||||
%define r3w r9w
|
||||
%define r6w r11w
|
||||
|
||||
%define r0b cl
|
||||
%define r1b dl
|
||||
@ -149,6 +150,7 @@ BITS 64
|
||||
%define r1w si
|
||||
%define r2w dx
|
||||
%define r3w cx
|
||||
%define r6w r10w
|
||||
|
||||
%define r0b dil
|
||||
%define r1b sil
|
||||
@ -198,6 +200,7 @@ BITS 32
|
||||
%define r1w cx
|
||||
%define r2w dx
|
||||
%define r3w bx
|
||||
%define r6w bp
|
||||
|
||||
%define r0b al
|
||||
%define r1b cl
|
||||
|
@ -353,6 +353,283 @@ WELS_EXTERN WelsSampleSatd16x16_sse2
|
||||
;
|
||||
;***********************************************************************
|
||||
|
||||
|
||||
%macro SSE_DB_1_2REG 2 ; out %1 = 0x01 in every byte; %2 is scratch and is left as all ones
|
||||
pxor %1, %1 ; %1 = 0
|
||||
pcmpeqw %2, %2 ; %2 = all ones, i.e. every byte is 0xFF (-1)
|
||||
psubb %1, %2 ; per byte: 0 - (-1) = 1
|
||||
%endmacro
|
||||
|
||||
;***********************************************************************
|
||||
;
|
||||
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
|
||||
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
|
||||
;
|
||||
;***********************************************************************
|
||||
WELS_EXTERN WelsSampleSatdThree4x4_sse2
|
||||
|
||||
%ifdef X86_32
|
||||
push r3 ; 32-bit build: r3..r6 are saved here and restored before every ret
|
||||
push r4
|
||||
push r5
|
||||
push r6
|
||||
%assign push_num 4
|
||||
%else
|
||||
%assign push_num 0
|
||||
%endif
|
||||
PUSH_XMM 8
|
||||
|
||||
mov r2, arg3 ; r2 = pEnc (3rd argument in the prototype)
|
||||
mov r3, arg4 ; r3 = iLinesizeEnc (4th argument)
|
||||
SIGN_EXTENSION r3, r3d
|
||||
|
||||
; load source 4x4 samples and Hadamard transform
|
||||
movd xmm0, [r2] ; row 0 (4 bytes)
|
||||
movd xmm1, [r2+r3] ; row 1
|
||||
lea r2 , [r2+2*r3]
|
||||
movd xmm2, [r2] ; row 2
|
||||
movd xmm3, [r2+r3] ; row 3
|
||||
punpckldq xmm0, xmm2 ; xmm0 low qword = row 0, row 2
|
||||
punpckldq xmm1, xmm3 ; xmm1 low qword = row 1, row 3
|
||||
|
||||
pxor xmm6, xmm6
|
||||
punpcklbw xmm0, xmm6 ; zero-extend the 8 bytes to 8 words
|
||||
punpcklbw xmm1, xmm6
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
paddw xmm0, xmm1
|
||||
psubw xmm2, xmm1
|
||||
SSE2_XSawp qdq, xmm0, xmm2, xmm3
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
paddw xmm0, xmm3
|
||||
psubw xmm4, xmm3
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
punpcklwd xmm0, xmm4
|
||||
punpckhwd xmm4, xmm2
|
||||
|
||||
SSE2_XSawp dq, xmm0, xmm4, xmm3
|
||||
SSE2_XSawp qdq, xmm0, xmm3, xmm5
|
||||
|
||||
movdqa xmm7, xmm0
|
||||
paddw xmm0, xmm5
|
||||
psubw xmm7, xmm5
|
||||
|
||||
SSE2_XSawp qdq, xmm0, xmm7, xmm1
|
||||
|
||||
; Hadamard transform results are saved in xmm0 and xmm2
|
||||
movdqa xmm2, xmm0
|
||||
paddw xmm0, xmm1
|
||||
psubw xmm2, xmm1
|
||||
|
||||
;load top boundary samples: [a b c d]
|
||||
mov r0, arg1 ; r0 = pDec (1st argument)
|
||||
mov r1, arg2 ; r1 = iLineSizeDec (2nd argument)
|
||||
SIGN_EXTENSION r1, r1d
|
||||
sub r0, r1 ; r0 -> the row directly above the 4x4 block
|
||||
%ifdef UNIX64
|
||||
push r4 ; UNIX64 only: r4/r5 are used as scratch below and popped after the DC satd
|
||||
push r5 ; NOTE(review): presumably they carry arg5/arg6 in this ABI - confirm against the register map
|
||||
%endif
|
||||
|
||||
movzx r2d, byte [r0]
|
||||
movzx r3d, byte [r0+1]
|
||||
movzx r4d, byte [r0+2]
|
||||
movzx r5d, byte [r0+3]
|
||||
|
||||
; get the transform results of top boundary samples: [a b c d]
|
||||
add r3d, r2d ; r3d = a + b
|
||||
add r5d, r4d ; r5d = c + d
|
||||
add r2d, r2d ; r2d = a + a
|
||||
add r4d, r4d ; r4d = c + c
|
||||
sub r2d, r3d ; r2d = a + a - a - b = a - b
|
||||
sub r4d, r5d ; r4d = c + c - c - d = c - d
|
||||
add r5d, r3d ; r5d = (a + b) + (c + d)
|
||||
add r3d, r3d
|
||||
sub r3d, r5d ; r3d = (a + b) - (c + d)
|
||||
add r4d, r2d ; r4d = (a - b) + (c - d)
|
||||
add r2d, r2d
|
||||
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
|
||||
|
||||
movdqa xmm6, xmm0 ; keep copies of the source transform for the H and DC passes
|
||||
movdqa xmm7, xmm2
|
||||
movd xmm5, r5d ; keep the top-row sum (r5d = a+b+c+d) for the DC mode
|
||||
pxor xmm3, xmm3
|
||||
pxor xmm4, xmm4
|
||||
pinsrw xmm3, r5d, 0
|
||||
pinsrw xmm3, r4d, 4
|
||||
psllw xmm3, 2 ; scale the top-row coefficients by 4
|
||||
pinsrw xmm4, r3d, 0
|
||||
pinsrw xmm4, r2d, 4
|
||||
psllw xmm4, 2
|
||||
|
||||
; get the satd of V (prediction from the top boundary row)
|
||||
psubw xmm0, xmm3
|
||||
psubw xmm2, xmm4
|
||||
|
||||
WELS_AbsW xmm0, xmm1
|
||||
WELS_AbsW xmm2, xmm1
|
||||
paddusw xmm0, xmm2
|
||||
SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
|
||||
|
||||
;load left boundary samples: [a b c d]'
|
||||
add r0, r1 ; r0 back to pDec (first row of the block)
|
||||
|
||||
movzx r2d, byte [r0-1] ; [row-1] is the pixel just left of each row
|
||||
movzx r3d, byte [r0+r1-1]
|
||||
lea r0 , [r0+2*r1] ; r0 = pDec + 2*stride; it stays here until the mode branches below
|
||||
movzx r4d, byte [r0-1]
|
||||
movzx r5d, byte [r0+r1-1]
|
||||
|
||||
; get the transform results of left boundary samples: [a b c d]'
|
||||
add r3d, r2d ; r3d = a + b
|
||||
add r5d, r4d ; r5d = c + d
|
||||
add r2d, r2d ; r2d = a + a
|
||||
add r4d, r4d ; r4d = c + c
|
||||
sub r2d, r3d ; r2d = a + a - a - b = a - b
|
||||
sub r4d, r5d ; r4d = c + c - c - d = c - d
|
||||
add r5d, r3d ; r5d = (a + b) + (c + d)
|
||||
add r3d, r3d
|
||||
sub r3d, r5d ; r3d = (a + b) - (c + d)
|
||||
add r4d, r2d ; r4d = (a - b) + (c - d)
|
||||
add r2d, r2d
|
||||
sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
|
||||
|
||||
; store the transform results in xmm3
|
||||
movd xmm3, r5d
|
||||
pinsrw xmm3, r3d, 1
|
||||
pinsrw xmm3, r2d, 2
|
||||
pinsrw xmm3, r4d, 3
|
||||
psllw xmm3, 2 ; scale the left-column coefficients by 4
|
||||
|
||||
; get the satd of H (prediction from the left boundary column)
|
||||
movdqa xmm2, xmm6
|
||||
movdqa xmm4, xmm7
|
||||
psubw xmm2, xmm3
|
||||
WELS_AbsW xmm2, xmm1
|
||||
WELS_AbsW xmm4, xmm1
|
||||
paddusw xmm2, xmm4
|
||||
SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
|
||||
|
||||
; DC result is stored in xmm1
|
||||
add r5d, 4 ; r5d = left-column sum + 4 (rounding term)
|
||||
movd xmm1, r5d
|
||||
paddw xmm1, xmm5 ; + top-row sum saved earlier
|
||||
psrlw xmm1, 3 ; DC = (top sum + left sum + 4) >> 3
|
||||
movdqa xmm5, xmm1 ; xmm5 keeps the DC pixel value for the DC-mode output
|
||||
psllw xmm1, 4 ; DC * 16, subtracted from the transform in xmm6 below
|
||||
|
||||
; get the satd of DC
|
||||
psubw xmm6, xmm1
|
||||
WELS_AbsW xmm6, xmm1
|
||||
WELS_AbsW xmm7, xmm1
|
||||
paddusw xmm6, xmm7
|
||||
SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
|
||||
%ifdef UNIX64
|
||||
pop r5 ; UNIX64 only: undo the push r4 / push r5 made before the top-row loads
|
||||
pop r4
|
||||
%endif
|
||||
; comparing order: DC H V
|
||||
|
||||
mov r4, arg5 ; r4 = pRed (5th argument), destination of the 16 predicted bytes
|
||||
movd r2d, xmm6 ; r2d = DC satd
|
||||
movd r3d, xmm2 ; r3d = H satd
|
||||
movd r6d, xmm0 ; r6d = V satd
|
||||
|
||||
and r2d, 0xffff ; keep the low 16-bit sum, then halve it
|
||||
shr r2d, 1
|
||||
and r3d, 0xffff
|
||||
shr r3d, 1
|
||||
and r6d, 0xffff
|
||||
shr r6d, 1
|
||||
add r2d, dword arg7 ; add the 7th/8th/9th arguments (unnamed int32_t in the prototype) to the DC/H/V costs
|
||||
add r3d, dword arg8
|
||||
add r6d, dword arg9
|
||||
cmp r2w, r3w ; 16-bit signed compare: DC cost vs H cost
|
||||
jg near not_dc
|
||||
cmp r2w, r6w ; 16-bit signed compare: DC cost vs V cost
|
||||
jg near not_dc_h
|
||||
|
||||
; for DC mode
|
||||
movd r3d, xmm5
|
||||
imul r3d, 0x01010101 ; replicate the DC byte into all 4 bytes of the dword
|
||||
movd xmm5, r3d
|
||||
pshufd xmm5, xmm5, 0 ; broadcast the dword to all 4 rows
|
||||
movdqa [r4], xmm5 ; aligned 16-byte store to pRed
|
||||
mov r5, arg6 ; r5 = pBestMode (6th argument)
|
||||
mov dword [r5], 0x02 ; mode 2 is written for DC
|
||||
mov retrd, r2d ; return the DC cost
|
||||
POP_XMM
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
pop r3
|
||||
%endif
|
||||
ret
|
||||
|
||||
not_dc:
|
||||
cmp r3w, r6w ; 16-bit signed compare: H cost vs V cost
|
||||
jg near not_dc_h
|
||||
|
||||
; for H mode
|
||||
SSE_DB_1_2REG xmm6, xmm7
|
||||
sub r0, r1 ; r0 was pDec + 2*stride; two subtractions bring it back to pDec
|
||||
sub r0, r1
|
||||
movzx r6d, byte [r0-1]
|
||||
movd xmm0, r6d
|
||||
pmuludq xmm0, xmm6 ; left pixel * 0x01010101 -> pixel replicated into 4 bytes
|
||||
|
||||
movzx r6d, byte [r0+r1-1]
|
||||
movd xmm1, r6d
|
||||
pmuludq xmm1, xmm6
|
||||
punpckldq xmm0, xmm1 ; rows 0 and 1
|
||||
|
||||
lea r0, [r0+r1*2]
|
||||
movzx r6d, byte [r0-1]
|
||||
movd xmm2, r6d
|
||||
pmuludq xmm2, xmm6
|
||||
|
||||
movzx r6d, byte [r0+r1-1]
|
||||
movd xmm3, r6d
|
||||
pmuludq xmm3, xmm6
|
||||
punpckldq xmm2, xmm3 ; rows 2 and 3
|
||||
punpcklqdq xmm0, xmm2 ; xmm0 = the 4 predicted rows
|
||||
|
||||
movdqa [r4],xmm0
|
||||
|
||||
mov retrd, r3d ; return the H cost
|
||||
mov r5, arg6
|
||||
mov dword [r5], 0x01 ; mode 1 is written for H
|
||||
POP_XMM
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
pop r3
|
||||
%endif
|
||||
ret
|
||||
not_dc_h:
|
||||
sub r0, r1 ; V mode: r0 was pDec + 2*stride; three subtractions reach the row above the block
|
||||
sub r0, r1
|
||||
sub r0, r1
|
||||
movd xmm0, [r0] ; the 4 top boundary pixels
|
||||
pshufd xmm0, xmm0, 0 ; same 4 pixels repeated for all 4 rows
|
||||
movdqa [r4],xmm0
|
||||
mov retrd, r6d ; return the V cost
|
||||
mov r5, arg6
|
||||
mov dword [r5], 0x00 ; mode 0 is written for V
|
||||
POP_XMM
|
||||
%ifdef X86_32
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
pop r3
|
||||
%endif
|
||||
ret
|
||||
|
||||
|
||||
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
|
||||
pmaddubsw %1, xmm5
|
||||
movdqa %2, %1
|
||||
@ -390,12 +667,12 @@ WELS_EXTERN WelsSampleSatd16x16_sse2
|
||||
|
||||
%macro SSE41_GetX38x4SatdDec 0
|
||||
pxor xmm7, xmm7
|
||||
movq xmm0, [eax]
|
||||
movq xmm1, [eax+ebx]
|
||||
lea eax, [eax+2*ebx]
|
||||
movq xmm2, [eax]
|
||||
movq xmm3, [eax+ebx]
|
||||
lea eax, [eax+2*ebx]
|
||||
movq xmm0, [r2]
|
||||
movq xmm1, [r2+r3]
|
||||
lea r2, [r2+2*r3]
|
||||
movq xmm2, [r2]
|
||||
movq xmm3, [r2+r3]
|
||||
lea r2, [r2+2*r3]
|
||||
punpcklbw xmm0, xmm7
|
||||
punpcklbw xmm1, xmm7
|
||||
punpcklbw xmm2, xmm7
|
||||
@ -405,34 +682,35 @@ WELS_EXTERN WelsSampleSatd16x16_sse2
|
||||
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
|
||||
;doesn't need another transpose
|
||||
%endmacro
|
||||
|
||||
%macro SSE41_GetX38x4SatdV 2
|
||||
pxor xmm0, xmm0
|
||||
pinsrw xmm0, word[esi+%2], 0
|
||||
pinsrw xmm0, word[esi+%2+8], 4
|
||||
pinsrw xmm0, word[r6+%2], 0
|
||||
pinsrw xmm0, word[r6+%2+8], 4
|
||||
psubsw xmm0, xmm7
|
||||
pabsw xmm0, xmm0
|
||||
paddw xmm4, xmm0
|
||||
pxor xmm0, xmm0
|
||||
pinsrw xmm0, word[esi+%2+2], 0
|
||||
pinsrw xmm0, word[esi+%2+10], 4
|
||||
pinsrw xmm0, word[r6+%2+2], 0
|
||||
pinsrw xmm0, word[r6+%2+10], 4
|
||||
psubsw xmm0, xmm1
|
||||
pabsw xmm0, xmm0
|
||||
paddw xmm4, xmm0
|
||||
pxor xmm0, xmm0
|
||||
pinsrw xmm0, word[esi+%2+4], 0
|
||||
pinsrw xmm0, word[esi+%2+12], 4
|
||||
pinsrw xmm0, word[r6+%2+4], 0
|
||||
pinsrw xmm0, word[r6+%2+12], 4
|
||||
psubsw xmm0, xmm3
|
||||
pabsw xmm0, xmm0
|
||||
paddw xmm4, xmm0
|
||||
pxor xmm0, xmm0
|
||||
pinsrw xmm0, word[esi+%2+6], 0
|
||||
pinsrw xmm0, word[esi+%2+14], 4
|
||||
pinsrw xmm0, word[r6+%2+6], 0
|
||||
pinsrw xmm0, word[r6+%2+14], 4
|
||||
psubsw xmm0, xmm2
|
||||
pabsw xmm0, xmm0
|
||||
paddw xmm4, xmm0
|
||||
%endmacro
|
||||
%macro SSE41_GetX38x4SatdH 3
|
||||
movq xmm0, [esi+%3+8*%1]
|
||||
movq xmm0, [r6+%3+8*%1]
|
||||
punpcklqdq xmm0, xmm0
|
||||
psubsw xmm0, xmm7
|
||||
pabsw xmm0, xmm0
|
||||
@ -455,7 +733,7 @@ WELS_EXTERN WelsSampleSatd16x16_sse2
|
||||
%endmacro
|
||||
%macro SSE41_ChromaGetX38x4SatdDC 1
|
||||
shl %1, 4
|
||||
movdqa xmm0, [esi+32+%1]
|
||||
movdqa xmm0, [r6+32+%1]
|
||||
psubsw xmm0, xmm7
|
||||
pabsw xmm0, xmm0
|
||||
paddw xmm6, xmm0
|
||||
@ -481,83 +759,93 @@ WELS_EXTERN WelsSampleSatd16x16_sse2
|
||||
paddd %1, %3
|
||||
%endmacro
|
||||
|
||||
|
||||
%ifdef X86_32
|
||||
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
mov ecx, [esp+16]
|
||||
mov edx, [esp+20]
|
||||
mov eax, [esp+24]
|
||||
mov ebx, [esp+28]
|
||||
mov esi, [esp+40] ;temp_satd
|
||||
%assign push_num 0
|
||||
LOAD_7_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r5, r5d
|
||||
|
||||
%ifndef X86_32
|
||||
push r12
|
||||
mov r12, r2
|
||||
%endif
|
||||
|
||||
pxor xmm4, xmm4
|
||||
movdqa xmm5, [HSumSubDB1]
|
||||
movdqa xmm6, [HSumSubDW1]
|
||||
movdqa xmm7, [PDW1]
|
||||
sub ecx, edx
|
||||
movdqu xmm0, [ecx]
|
||||
sub r0, r1
|
||||
movdqu xmm0, [r0]
|
||||
movhlps xmm1, xmm0
|
||||
punpcklqdq xmm0, xmm0
|
||||
punpcklqdq xmm1, xmm1
|
||||
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
|
||||
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
|
||||
movdqa [esi], xmm0 ;V
|
||||
movdqa [esi+16], xmm1
|
||||
add ecx, edx
|
||||
pinsrb xmm0, byte[ecx-1], 0
|
||||
pinsrb xmm0, byte[ecx+edx-1], 1
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 2
|
||||
pinsrb xmm0, byte[ecx+edx-1], 3
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 4
|
||||
pinsrb xmm0, byte[ecx+edx-1], 5
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 6
|
||||
pinsrb xmm0, byte[ecx+edx-1], 7
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 8
|
||||
pinsrb xmm0, byte[ecx+edx-1], 9
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 10
|
||||
pinsrb xmm0, byte[ecx+edx-1], 11
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 12
|
||||
pinsrb xmm0, byte[ecx+edx-1], 13
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 14
|
||||
pinsrb xmm0, byte[ecx+edx-1], 15
|
||||
movdqa [r6], xmm0 ;V
|
||||
movdqa [r6+16], xmm1
|
||||
add r0, r1
|
||||
pinsrb xmm0, byte[r0-1], 0
|
||||
pinsrb xmm0, byte[r0+r1-1], 1
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 2
|
||||
pinsrb xmm0, byte[r0+r1-1], 3
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 4
|
||||
pinsrb xmm0, byte[r0+r1-1], 5
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 6
|
||||
pinsrb xmm0, byte[r0+r1-1], 7
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 8
|
||||
pinsrb xmm0, byte[r0+r1-1], 9
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 10
|
||||
pinsrb xmm0, byte[r0+r1-1], 11
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 12
|
||||
pinsrb xmm0, byte[r0+r1-1], 13
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 14
|
||||
pinsrb xmm0, byte[r0+r1-1], 15
|
||||
movhlps xmm1, xmm0
|
||||
punpcklqdq xmm0, xmm0
|
||||
punpcklqdq xmm1, xmm1
|
||||
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
|
||||
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
|
||||
movdqa [esi+32], xmm0 ;H
|
||||
movdqa [esi+48], xmm1
|
||||
movd ecx, xmm4 ;dc
|
||||
add ecx, 16 ;(sum+16)
|
||||
shr ecx, 5 ;((sum+16)>>5)
|
||||
shl ecx, 4 ;
|
||||
movd mm4, ecx ; mm4 copy DC
|
||||
movdqa [r6+32], xmm0 ;H
|
||||
movdqa [r6+48], xmm1
|
||||
movd r0d, xmm4 ;dc
|
||||
add r0d, 16 ;(sum+16)
|
||||
shr r0d, 5 ;((sum+16)>>5)
|
||||
shl r0d, 4 ;
|
||||
movd mm4, r0d ; mm4 copy DC
|
||||
pxor xmm4, xmm4 ;V
|
||||
pxor xmm5, xmm5 ;H
|
||||
pxor xmm6, xmm6 ;DC
|
||||
mov ecx, 0
|
||||
mov edi, 0
|
||||
%ifdef UNIX64
|
||||
push r4
|
||||
%endif
|
||||
mov r0, 0
|
||||
mov r4, 0
|
||||
|
||||
.loop16x16_get_satd:
|
||||
.loopStart1:
|
||||
SSE41_I16x16GetX38x4Satd ecx, edi
|
||||
inc ecx
|
||||
cmp ecx, 4
|
||||
SSE41_I16x16GetX38x4Satd r0, r4
|
||||
inc r0
|
||||
cmp r0, 4
|
||||
jl .loopStart1
|
||||
cmp edi, 16
|
||||
cmp r4, 16
|
||||
je .loop16x16_get_satd_end
|
||||
mov eax, [esp+24]
|
||||
add eax, 8
|
||||
mov ecx, 0
|
||||
add edi, 16
|
||||
%ifdef X86_32
|
||||
mov r2, arg3
|
||||
%else
|
||||
mov r2, r12
|
||||
%endif
|
||||
add r2, 8
|
||||
mov r0, 0
|
||||
add r4, 16
|
||||
jmp .loop16x16_get_satd
|
||||
.loop16x16_get_satd_end:
|
||||
MMX_DW_1_2REG xmm0, xmm1
|
||||
@ -568,66 +856,70 @@ WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
|
||||
SSE41_HSum8W xmm5, xmm0, xmm1
|
||||
SSE41_HSum8W xmm6, xmm0, xmm1
|
||||
|
||||
%ifdef UNIX64
|
||||
pop r4
|
||||
%endif
|
||||
; comparing order: DC H V
|
||||
movd ebx, xmm6 ;DC
|
||||
movd edi, xmm5 ;H
|
||||
movd ecx, xmm4 ;V
|
||||
mov edx, [esp+36]
|
||||
shl edx, 1
|
||||
add edi, edx
|
||||
add ebx, edx
|
||||
mov edx, [esp+32]
|
||||
cmp ebx, edi
|
||||
movd r3d, xmm6 ;DC
|
||||
movd r1d, xmm5 ;H
|
||||
movd r0d, xmm4 ;V
|
||||
%ifndef X86_32
|
||||
pop r12
|
||||
%endif
|
||||
shl r5d, 1
|
||||
add r1d, r5d
|
||||
add r3d, r5d
|
||||
mov r4, arg5
|
||||
cmp r3d, r1d
|
||||
jge near not_dc_16x16
|
||||
cmp ebx, ecx
|
||||
cmp r3d, r0d
|
||||
jge near not_dc_h_16x16
|
||||
|
||||
; for DC mode
|
||||
mov dword[edx], 2;I16_PRED_DC
|
||||
mov eax, ebx
|
||||
mov dword[r4], 2;I16_PRED_DC
|
||||
mov retrd, r3d
|
||||
jmp near return_satd_intra_16x16_x3
|
||||
not_dc_16x16:
|
||||
; for H mode
|
||||
cmp edi, ecx
|
||||
cmp r1d, r0d
|
||||
jge near not_dc_h_16x16
|
||||
mov dword[edx], 1;I16_PRED_H
|
||||
mov eax, edi
|
||||
mov dword[r4], 1;I16_PRED_H
|
||||
mov retrd, r1d
|
||||
jmp near return_satd_intra_16x16_x3
|
||||
not_dc_h_16x16:
|
||||
; for V mode
|
||||
mov dword[edx], 0;I16_PRED_V
|
||||
mov eax, ecx
|
||||
mov dword[r4], 0;I16_PRED_V
|
||||
mov retrd, r0d
|
||||
return_satd_intra_16x16_x3:
|
||||
WELSEMMS
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
POP_XMM
|
||||
LOAD_7_PARA_POP
|
||||
ret
|
||||
|
||||
%macro SSE41_ChromaGetX38x8Satd 0
|
||||
movdqa xmm5, [HSumSubDB1]
|
||||
movdqa xmm6, [HSumSubDW1]
|
||||
movdqa xmm7, [PDW1]
|
||||
sub ecx, edx
|
||||
movq xmm0, [ecx]
|
||||
sub r0, r1
|
||||
movq xmm0, [r0]
|
||||
punpcklqdq xmm0, xmm0
|
||||
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
|
||||
movdqa [esi], xmm0 ;V
|
||||
add ecx, edx
|
||||
pinsrb xmm0, byte[ecx-1], 0
|
||||
pinsrb xmm0, byte[ecx+edx-1], 1
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 2
|
||||
pinsrb xmm0, byte[ecx+edx-1], 3
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 4
|
||||
pinsrb xmm0, byte[ecx+edx-1], 5
|
||||
lea ecx, [ecx+2*edx]
|
||||
pinsrb xmm0, byte[ecx-1], 6
|
||||
pinsrb xmm0, byte[ecx+edx-1], 7
|
||||
movdqa [r6], xmm0 ;V
|
||||
add r0, r1
|
||||
pinsrb xmm0, byte[r0-1], 0
|
||||
pinsrb xmm0, byte[r0+r1-1], 1
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 2
|
||||
pinsrb xmm0, byte[r0+r1-1], 3
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 4
|
||||
pinsrb xmm0, byte[r0+r1-1], 5
|
||||
lea r0, [r0+2*r1]
|
||||
pinsrb xmm0, byte[r0-1], 6
|
||||
pinsrb xmm0, byte[r0+r1-1], 7
|
||||
punpcklqdq xmm0, xmm0
|
||||
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
|
||||
movdqa [esi+16], xmm0 ;H
|
||||
movdqa [r6+16], xmm0 ;H
|
||||
;(sum+2)>>2
|
||||
movdqa xmm6, [PDQ2]
|
||||
movdqa xmm5, xmm4
|
||||
@ -647,21 +939,19 @@ ret
|
||||
punpcklqdq xmm4, xmm5
|
||||
psllq xmm4, 32
|
||||
psrlq xmm4, 32
|
||||
movdqa [esi+32], xmm4
|
||||
movdqa [r6+32], xmm4
|
||||
punpckhqdq xmm5, xmm6
|
||||
psllq xmm5, 32
|
||||
psrlq xmm5, 32
|
||||
movdqa [esi+48], xmm5
|
||||
movdqa [r6+48], xmm5
|
||||
|
||||
pxor xmm4, xmm4 ;V
|
||||
pxor xmm5, xmm5 ;H
|
||||
pxor xmm6, xmm6 ;DC
|
||||
mov ecx, 0
|
||||
loop_chroma_satdx3_cb_cr:
|
||||
SSE41_ChromaGetX38x4Satd ecx, 0
|
||||
inc ecx
|
||||
cmp ecx, 2
|
||||
jl loop_chroma_satdx3_cb_cr
|
||||
mov r0, 0
|
||||
SSE41_ChromaGetX38x4Satd r0, 0
|
||||
inc r0
|
||||
SSE41_ChromaGetX38x4Satd r0, 0
|
||||
%endmacro
|
||||
|
||||
%macro SSEReg2MMX 3
|
||||
@ -677,27 +967,22 @@ loop_chroma_satdx3_cb_cr:
|
||||
;to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
|
||||
|
||||
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
mov ecx, [esp+16]
|
||||
mov edx, [esp+20]
|
||||
mov eax, [esp+24]
|
||||
mov ebx, [esp+28]
|
||||
mov esi, [esp+40] ;temp_satd
|
||||
xor edi, edi
|
||||
%assign push_num 0
|
||||
LOAD_7_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r5, r5d
|
||||
loop_chroma_satdx3:
|
||||
SSE41_ChromaGetX38x8Satd
|
||||
cmp edi, 1
|
||||
je loop_chroma_satdx3end
|
||||
inc edi
|
||||
SSEReg2MMX xmm4, mm0,mm1
|
||||
SSEReg2MMX xmm5, mm2,mm3
|
||||
SSEReg2MMX xmm6, mm5,mm6
|
||||
mov ecx, [esp+44]
|
||||
mov eax, [esp+48]
|
||||
jmp loop_chroma_satdx3
|
||||
loop_chroma_satdx3end:
|
||||
mov r0, arg8
|
||||
mov r2, arg9
|
||||
|
||||
SSE41_ChromaGetX38x8Satd
|
||||
|
||||
MMXReg2SSE xmm0, xmm3, mm0, mm1
|
||||
MMXReg2SSE xmm1, xmm3, mm2, mm3
|
||||
MMXReg2SSE xmm2, xmm3, mm5, mm6
|
||||
@ -714,39 +999,38 @@ loop_chroma_satdx3end:
|
||||
SSE41_HSum8W xmm5, xmm0, xmm1
|
||||
SSE41_HSum8W xmm6, xmm0, xmm1
|
||||
; comparing order: DC H V
|
||||
movd ebx, xmm6 ;DC
|
||||
movd edi, xmm5 ;H
|
||||
movd ecx, xmm4 ;V
|
||||
mov edx, [esp+36]
|
||||
shl edx, 1
|
||||
add edi, edx
|
||||
add ecx, edx
|
||||
mov edx, [esp+32]
|
||||
cmp ebx, edi
|
||||
movd r3d, xmm6 ;DC
|
||||
movd r1d, xmm5 ;H
|
||||
movd r0d, xmm4 ;V
|
||||
|
||||
|
||||
shl r5d, 1
|
||||
add r1d, r5d
|
||||
add r0d, r5d
|
||||
cmp r3d, r1d
|
||||
jge near not_dc_8x8
|
||||
cmp ebx, ecx
|
||||
cmp r3d, r0d
|
||||
jge near not_dc_h_8x8
|
||||
|
||||
; for DC mode
|
||||
mov dword[edx], 0;I8_PRED_DC
|
||||
mov eax, ebx
|
||||
mov dword[r4], 0;I8_PRED_DC
|
||||
mov retrd, r3d
|
||||
jmp near return_satd_intra_8x8_x3
|
||||
not_dc_8x8:
|
||||
; for H mode
|
||||
cmp edi, ecx
|
||||
cmp r1d, r0d
|
||||
jge near not_dc_h_8x8
|
||||
mov dword[edx], 1;I8_PRED_H
|
||||
mov eax, edi
|
||||
mov dword[r4], 1;I8_PRED_H
|
||||
mov retrd, r1d
|
||||
jmp near return_satd_intra_8x8_x3
|
||||
not_dc_h_8x8:
|
||||
; for V mode
|
||||
mov dword[edx], 2;I8_PRED_V
|
||||
mov eax, ecx
|
||||
mov dword[r4], 2;I8_PRED_V
|
||||
mov retrd, r0d
|
||||
return_satd_intra_8x8_x3:
|
||||
WELSEMMS
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
POP_XMM
|
||||
LOAD_7_PARA_POP
|
||||
ret
|
||||
|
||||
|
||||
@ -769,9 +1053,9 @@ ret
|
||||
paddw xmm3,xmm6
|
||||
%endmacro
|
||||
%macro WelsAddDCValue 4
|
||||
movzx %2, byte %1
|
||||
mov %3, %2
|
||||
add %4, %2
|
||||
movzx %2, byte %1
|
||||
mov %3, %2
|
||||
add %4, %2
|
||||
%endmacro
|
||||
|
||||
;***********************************************************************
|
||||
@ -780,133 +1064,139 @@ ret
|
||||
;
|
||||
;***********************************************************************
|
||||
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
mov ecx, [esp+16]
|
||||
mov edx, [esp+20]
|
||||
mov edi, [esp+40] ;temp_sad
|
||||
sub ecx, edx
|
||||
movdqa xmm5,[ecx]
|
||||
pxor xmm0,xmm0
|
||||
psadbw xmm0,xmm5
|
||||
movhlps xmm1,xmm0
|
||||
paddw xmm0,xmm1
|
||||
movd eax,xmm0
|
||||
%assign push_num 0
|
||||
LOAD_7_PARA
|
||||
PUSH_XMM 8
|
||||
SIGN_EXTENSION r1, r1d
|
||||
SIGN_EXTENSION r3, r3d
|
||||
SIGN_EXTENSION r5, r5d
|
||||
|
||||
add ecx,edx
|
||||
lea ebx, [edx+2*edx]
|
||||
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
||||
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
||||
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
||||
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
||||
lea ecx, [ecx+4*edx]
|
||||
add edi, 64
|
||||
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
||||
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
||||
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
||||
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
||||
lea ecx, [ecx+4*edx]
|
||||
add edi, 64
|
||||
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
||||
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
||||
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
||||
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
||||
lea ecx, [ecx+4*edx]
|
||||
add edi, 64
|
||||
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
|
||||
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
|
||||
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
|
||||
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
|
||||
sub edi, 192
|
||||
add eax,10h
|
||||
shr eax,5
|
||||
movd xmm7,eax
|
||||
pxor xmm1,xmm1
|
||||
pshufb xmm7,xmm1
|
||||
pxor xmm4,xmm4
|
||||
pxor xmm3,xmm3
|
||||
pxor xmm2,xmm2
|
||||
;sad begin
|
||||
mov eax, [esp+24]
|
||||
mov ebx, [esp+28]
|
||||
lea esi, [ebx+2*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi], [eax]
|
||||
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
||||
add edi, 64
|
||||
lea eax, [eax+4*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi], [eax]
|
||||
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
||||
add edi, 64
|
||||
lea eax, [eax+4*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi], [eax]
|
||||
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
||||
add edi, 64
|
||||
lea eax, [eax+4*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi], [eax]
|
||||
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
|
||||
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
|
||||
push r5
|
||||
push r4
|
||||
push r3
|
||||
|
||||
pslldq xmm3,4
|
||||
por xmm3,xmm2
|
||||
movhlps xmm1,xmm3
|
||||
paddw xmm3,xmm1
|
||||
movhlps xmm0,xmm4
|
||||
paddw xmm4,xmm0
|
||||
; comparing order: DC H V
|
||||
movd ebx, xmm4 ;DC
|
||||
movd ecx, xmm3 ;V
|
||||
sub r0, r1
|
||||
movdqa xmm5,[r0]
|
||||
pxor xmm0,xmm0
|
||||
psadbw xmm0,xmm5
|
||||
movhlps xmm1,xmm0
|
||||
paddw xmm0,xmm1
|
||||
movd r5d, xmm0
|
||||
|
||||
add r0,r1
|
||||
lea r3,[r1+2*r1] ;ebx r3
|
||||
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
|
||||
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
||||
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
||||
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
||||
lea r0, [r0+4*r1]
|
||||
add r6, 64
|
||||
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
|
||||
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
||||
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
||||
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
||||
lea r0, [r0+4*r1]
|
||||
add r6, 64
|
||||
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
|
||||
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
||||
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
||||
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
||||
lea r0, [r0+4*r1]
|
||||
add r6, 64
|
||||
WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
|
||||
WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
|
||||
WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
|
||||
WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
|
||||
sub r6, 192
|
||||
add r5d,10h
|
||||
shr r5d,5
|
||||
movd xmm7,r5d
|
||||
pxor xmm1,xmm1
|
||||
pshufb xmm7,xmm1
|
||||
pxor xmm4,xmm4
|
||||
pxor xmm3,xmm3
|
||||
pxor xmm2,xmm2
|
||||
;sad begin
|
||||
pop r3
|
||||
lea r4, [r3+2*r3] ;esi r4
|
||||
SSSE3_Get16BSadHVDC [r6], [r2]
|
||||
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
||||
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
||||
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
||||
add r6, 64
|
||||
lea r2, [r2+4*r3]
|
||||
SSSE3_Get16BSadHVDC [r6], [r2]
|
||||
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
||||
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
||||
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
||||
add r6, 64
|
||||
lea r2, [r2+4*r3]
|
||||
SSSE3_Get16BSadHVDC [r6], [r2]
|
||||
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
||||
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
||||
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
||||
add r6, 64
|
||||
lea r2, [r2+4*r3]
|
||||
SSSE3_Get16BSadHVDC [r6], [r2]
|
||||
SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
|
||||
SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
|
||||
SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
|
||||
|
||||
pop r4
|
||||
pop r5
|
||||
pslldq xmm3,4
|
||||
por xmm3,xmm2
|
||||
movhlps xmm1,xmm3
|
||||
paddw xmm3,xmm1
|
||||
movhlps xmm0,xmm4
|
||||
paddw xmm4,xmm0
|
||||
; comparing order: DC H V
|
||||
movd r1d, xmm4 ;DC ;ebx r1d
|
||||
movd r0d, xmm3 ;V ;ecx r0d
|
||||
psrldq xmm3, 4
|
||||
movd esi, xmm3 ;H
|
||||
mov eax, [esp+36] ;lamda
|
||||
shl eax, 1
|
||||
add esi, eax
|
||||
add ebx, eax
|
||||
mov edx, [esp+32]
|
||||
cmp ebx, esi
|
||||
movd r2d, xmm3 ;H ;esi r2d
|
||||
|
||||
;mov eax, [esp+36] ;lambda ;eax r5
|
||||
shl r5d, 1
|
||||
add r2d, r5d
|
||||
add r1d, r5d
|
||||
;mov edx, [esp+32] ;edx r4
|
||||
cmp r1d, r2d
|
||||
jge near not_dc_16x16_sad
|
||||
cmp ebx, ecx
|
||||
cmp r1d, r0d
|
||||
jge near not_dc_h_16x16_sad
|
||||
; for DC mode
|
||||
mov dword[edx], 2;I16_PRED_DC
|
||||
mov eax, ebx
|
||||
sub edi, 192
|
||||
mov dword[r4], 2;I16_PRED_DC
|
||||
mov retrd, r1d
|
||||
sub r6, 192
|
||||
%assign x 0
|
||||
%rep 16
|
||||
movdqa [edi+16*x], xmm7
|
||||
movdqa [r6+16*x], xmm7
|
||||
%assign x x+1
|
||||
%endrep
|
||||
jmp near return_sad_intra_16x16_x3
|
||||
not_dc_16x16_sad:
|
||||
; for H mode
|
||||
cmp esi, ecx
|
||||
cmp r2d, r0d
|
||||
jge near not_dc_h_16x16_sad
|
||||
mov dword[edx], 1;I16_PRED_H
|
||||
mov eax, esi
|
||||
mov dword[r4], 1;I16_PRED_H
|
||||
mov retrd, r2d
|
||||
jmp near return_sad_intra_16x16_x3
|
||||
not_dc_h_16x16_sad:
|
||||
; for V mode
|
||||
mov dword[edx], 0;I16_PRED_V
|
||||
mov eax, ecx
|
||||
sub edi, 192
|
||||
mov dword[r4], 0;I16_PRED_V
|
||||
mov retrd, r0d
|
||||
sub r6, 192
|
||||
%assign x 0
|
||||
%rep 16
|
||||
movdqa [edi+16*x], xmm5
|
||||
movdqa [r6+16*x], xmm5
|
||||
%assign x x+1
|
||||
%endrep
|
||||
return_sad_intra_16x16_x3:
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
POP_XMM
|
||||
LOAD_7_PARA_POP
|
||||
ret
|
||||
%endif
|
||||
|
||||
;***********************************************************************
|
||||
;
|
||||
;Pixel_sad_intra_ssse3 END
|
||||
|
@ -47,6 +47,11 @@ int32_t WelsSampleSatd8x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
//int32_t WelsSampleSatd4x8( uint8_t *, int32_t, uint8_t *, int32_t );
|
||||
int32_t WelsSampleSatd4x4_c (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
|
||||
int32_t WelsSampleSatdIntra4x4Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t);
|
||||
int32_t WelsSampleSatdIntra16x16Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsSampleSadIntra16x16Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsSampleSatdIntra8x8Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
|
||||
uint8_t*, uint8_t*);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
|
@ -367,11 +367,11 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;
|
||||
//pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSampleSatdThree4x4_sse2;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSampleSatdThree4x4_sse2;
|
||||
}
|
||||
|
||||
if (uiCpuFlag & WELS_CPU_SSSE3) {
|
||||
//pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
|
||||
}
|
||||
|
||||
if (uiCpuFlag & WELS_CPU_SSE41) {
|
||||
@ -380,8 +380,8 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
|
||||
//pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
|
||||
//pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
|
||||
}
|
||||
|
||||
#endif //(X86_ASM)
|
||||
|
@ -1166,251 +1166,4 @@ WELS_EXTERN WelsI16x16LumaPredDc_sse2
|
||||
|
||||
pop r4
|
||||
pop r3
|
||||
ret
|
||||
|
||||
;***********************************************************************
|
||||
;
|
||||
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
|
||||
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
|
||||
;
|
||||
;***********************************************************************
|
||||
%ifdef X86_32
|
||||
WELS_EXTERN WelsSampleSatdThree4x4_sse2
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp+24];p_enc
|
||||
mov ebx, [esp+28];linesize_enc
|
||||
|
||||
; load source 4x4 samples and Hadamard transform
|
||||
movd xmm0, [eax]
|
||||
movd xmm1, [eax+ebx]
|
||||
lea eax , [eax+2*ebx]
|
||||
movd xmm2, [eax]
|
||||
movd xmm3, [eax+ebx]
|
||||
punpckldq xmm0, xmm2
|
||||
punpckldq xmm1, xmm3
|
||||
|
||||
pxor xmm6, xmm6
|
||||
punpcklbw xmm0, xmm6
|
||||
punpcklbw xmm1, xmm6
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
paddw xmm0, xmm1
|
||||
psubw xmm2, xmm1
|
||||
SSE2_XSawp qdq, xmm0, xmm2, xmm3
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
paddw xmm0, xmm3
|
||||
psubw xmm4, xmm3
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
punpcklwd xmm0, xmm4
|
||||
punpckhwd xmm4, xmm2
|
||||
|
||||
SSE2_XSawp dq, xmm0, xmm4, xmm3
|
||||
SSE2_XSawp qdq, xmm0, xmm3, xmm5
|
||||
|
||||
movdqa xmm7, xmm0
|
||||
paddw xmm0, xmm5
|
||||
psubw xmm7, xmm5
|
||||
|
||||
SSE2_XSawp qdq, xmm0, xmm7, xmm1
|
||||
|
||||
; Hadamard transform results are saved in xmm0 and xmm2
|
||||
movdqa xmm2, xmm0
|
||||
paddw xmm0, xmm1
|
||||
psubw xmm2, xmm1
|
||||
|
||||
; load top boundary samples: [a b c d]
|
||||
mov eax, [esp+16];p_dec
|
||||
sub eax, [esp+20];linesize_dec
|
||||
movzx ecx, byte [eax]
|
||||
movzx edx, byte [eax+1]
|
||||
movzx esi, byte [eax+2]
|
||||
movzx edi, byte [eax+3]
|
||||
|
||||
; get the transform results of top boundary samples: [a b c d]
|
||||
add edx, ecx ; edx = a + b
|
||||
add edi, esi ; edi = c + d
|
||||
add ecx, ecx ; ecx = a + a
|
||||
add esi, esi ; esi = c + c
|
||||
sub ecx, edx ; ecx = a + a - a - b = a - b
|
||||
sub esi, edi ; esi = c + c - c - d = c - d
|
||||
add edi, edx ; edi = (a + b) + (c + d)
|
||||
add edx, edx
|
||||
sub edx, edi ; edx = (a + b) - (c + d)
|
||||
add esi, ecx ; esi = (a - b) + (c - d)
|
||||
add ecx, ecx
|
||||
sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
|
||||
|
||||
movdqa xmm6, xmm0
|
||||
movdqa xmm7, xmm2
|
||||
movd xmm5, edi ; store the edi for DC mode
|
||||
pxor xmm3, xmm3
|
||||
pxor xmm4, xmm4
|
||||
pinsrw xmm3, edi, 0
|
||||
pinsrw xmm3, esi, 4
|
||||
psllw xmm3, 2
|
||||
pinsrw xmm4, edx, 0
|
||||
pinsrw xmm4, ecx, 4
|
||||
psllw xmm4, 2
|
||||
|
||||
; get the satd of V (xmm3/xmm4 hold the transformed top boundary samples)
|
||||
psubw xmm0, xmm3
|
||||
psubw xmm2, xmm4
|
||||
|
||||
WELS_AbsW xmm0, xmm1
|
||||
WELS_AbsW xmm2, xmm1
|
||||
paddusw xmm0, xmm2
|
||||
SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
|
||||
|
||||
; load left boundary samples: [a b c d]'
|
||||
mov eax, [esp+16]
|
||||
mov ebx, [esp+20]
|
||||
movzx ecx, byte [eax-1]
|
||||
movzx edx, byte [eax+ebx-1]
|
||||
lea eax , [eax+2*ebx]
|
||||
movzx esi, byte [eax-1]
|
||||
movzx edi, byte [eax+ebx-1]
|
||||
|
||||
; get the transform results of left boundary samples: [a b c d]'
|
||||
add edx, ecx ; edx = a + b
|
||||
add edi, esi ; edi = c + d
|
||||
add ecx, ecx ; ecx = a + a
|
||||
add esi, esi ; esi = c + c
|
||||
sub ecx, edx ; ecx = a + a - a - b = a - b
|
||||
sub esi, edi ; esi = c + c - c - d = c - d
|
||||
add edi, edx ; edi = (a + b) + (c + d)
|
||||
add edx, edx
|
||||
sub edx, edi ; edx = (a + b) - (c + d)
|
||||
add esi, ecx ; esi = (a - b) + (c - d)
|
||||
add ecx, ecx
|
||||
sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
|
||||
|
||||
; store the transform results in xmm3
|
||||
movd xmm3, edi
|
||||
pinsrw xmm3, edx, 1
|
||||
pinsrw xmm3, ecx, 2
|
||||
pinsrw xmm3, esi, 3
|
||||
psllw xmm3, 2
|
||||
|
||||
; get the satd of H (xmm3 holds the transformed left boundary samples)
|
||||
movdqa xmm2, xmm6
|
||||
movdqa xmm4, xmm7
|
||||
psubw xmm2, xmm3
|
||||
WELS_AbsW xmm2, xmm1
|
||||
WELS_AbsW xmm4, xmm1
|
||||
paddusw xmm2, xmm4
|
||||
SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2
|
||||
|
||||
; DC result is stored in xmm1
|
||||
add edi, 4
|
||||
movd xmm1, edi
|
||||
paddw xmm1, xmm5
|
||||
psrlw xmm1, 3
|
||||
movdqa xmm5, xmm1
|
||||
psllw xmm1, 4
|
||||
|
||||
; get the satd of DC
|
||||
psubw xmm6, xmm1
|
||||
WELS_AbsW xmm6, xmm1
|
||||
WELS_AbsW xmm7, xmm1
|
||||
paddusw xmm6, xmm7
|
||||
SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
|
||||
|
||||
; comparing order: DC H V
|
||||
mov edx, [esp+32]
|
||||
movd eax, xmm6
|
||||
movd edi, xmm2
|
||||
movd esi, xmm0
|
||||
and eax, 0xffff
|
||||
shr eax, 1
|
||||
and edi, 0xffff
|
||||
shr edi, 1
|
||||
and esi, 0xffff
|
||||
shr esi, 1
|
||||
add eax, [esp+40]
|
||||
add edi, [esp+44]
|
||||
add esi, [esp+48]
|
||||
cmp ax, di
|
||||
jg near not_dc
|
||||
cmp ax, si
|
||||
jg near not_dc_h
|
||||
|
||||
; for DC mode
|
||||
movd ebx, xmm5
|
||||
imul ebx, 0x01010101
|
||||
movd xmm5, ebx
|
||||
pshufd xmm5, xmm5, 0
|
||||
movdqa [edx], xmm5
|
||||
mov ebx, [esp+36]
|
||||
mov dword [ebx], 0x02
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
|
||||
not_dc:
|
||||
cmp di, si
|
||||
jg near not_dc_h
|
||||
|
||||
; for H mode
|
||||
SSE_DB_1_2REG xmm6, xmm7
|
||||
mov eax, [esp+16]
|
||||
mov ebx, [esp+20]
|
||||
movzx ecx, byte [eax-1]
|
||||
movd xmm0, ecx
|
||||
pmuludq xmm0, xmm6
|
||||
|
||||
movzx ecx, byte [eax+ebx-1]
|
||||
movd xmm1, ecx
|
||||
pmuludq xmm1, xmm6
|
||||
%if 1
|
||||
punpckldq xmm0, xmm1
|
||||
%else
|
||||
unpcklps xmm0, xmm1
|
||||
%endif
|
||||
lea eax, [eax+ebx*2]
|
||||
movzx ecx, byte [eax-1]
|
||||
movd xmm2, ecx
|
||||
pmuludq xmm2, xmm6
|
||||
|
||||
movzx ecx, byte [eax+ebx-1]
|
||||
movd xmm3, ecx
|
||||
pmuludq xmm3, xmm6
|
||||
%if 1
|
||||
punpckldq xmm2, xmm3
|
||||
punpcklqdq xmm0, xmm2
|
||||
%else
|
||||
unpcklps xmm2, xmm3
|
||||
unpcklpd xmm0, xmm2
|
||||
%endif
|
||||
movdqa [edx],xmm0
|
||||
|
||||
mov eax, edi
|
||||
mov ebx, [esp+36]
|
||||
mov dword [ebx], 0x01
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
not_dc_h:
|
||||
; for V mode
|
||||
mov eax, [esp+16]
|
||||
sub eax, [esp+20]
|
||||
movd xmm0, [eax]
|
||||
pshufd xmm0, xmm0, 0
|
||||
movdqa [edx],xmm0
|
||||
|
||||
mov eax, esi
|
||||
mov ebx, [esp+36]
|
||||
mov dword [ebx], 0x00
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
%endif
|
||||
|
||||
ret
|
@ -9,7 +9,136 @@
|
||||
#include "sad_common.h"
|
||||
|
||||
using namespace WelsSVCEnc;
|
||||
#ifdef X86_ASM
|
||||
// Runs WelsSampleSadIntra16x16Combined3_c and WelsIntra16x16Combined3Sad_ssse3
// on the same random buffers and checks that both return the same cost and
// report the same best mode. Skipped when the CPU does not report SSSE3.
TEST(IntraSadSatdFuncTest, WelsIntra16x16Combined3Sad_ssse3){
  const int32_t iLineSizeDec = 32;
  const int32_t iLineSizeEnc = 32;
  int32_t tmpa, tmpb;
  int32_t iBestMode_c, iBestMode_a, iLambda = 50;
  CMemoryAlign cMemoryAlign(0);
  int32_t iCpuCores = 0;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
  if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSSE3))
    return;

  // 32 rows of 32 bytes each for the reconstructed and source planes.
  uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec");
  uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc");
  uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst");

  srand((uint32_t)time(NULL));
  for(int i=0;i<(iLineSizeDec<<5);i++)
    pDec[i]=rand()%256;
  for(int i=0;i<(iLineSizeEnc<<5);i++)
    pEnc[i]=rand()%256;
  for(int i=0;i<512;i++)
    pDst[i]=rand()%256;

  // pDec+128 is row 4 of the 32-byte-stride buffer, so pDec+128-iLineSizeDec
  // and pDec+128-1 are still inside the allocation.
  tmpa = WelsSampleSadIntra16x16Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_c, iLambda, pDst);
  tmpb = WelsIntra16x16Combined3Sad_ssse3(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_a, iLambda, pDst);

  // Non-fatal EXPECT_* on purpose: a fatal ASSERT_* would return from the
  // test body right here and skip the WelsFree calls below.
  EXPECT_EQ(tmpa, tmpb);
  EXPECT_EQ(iBestMode_c, iBestMode_a);

  cMemoryAlign.WelsFree(pDec,"pDec");
  cMemoryAlign.WelsFree(pEnc,"pEnc");
  cMemoryAlign.WelsFree(pDst,"pDst");
}
|
||||
|
||||
// Runs WelsSampleSatdIntra16x16Combined3_c and WelsIntra16x16Combined3Satd_sse41
// on the same random buffers and checks that both return the same cost and
// report the same best mode. Skipped when the CPU does not report SSE4.1.
TEST(IntraSadSatdFuncTest, WelsIntra16x16Combined3Satd_sse41){
  const int32_t iLineSizeDec = 32;
  const int32_t iLineSizeEnc = 32;
  int32_t tmpa, tmpb;
  int32_t iBestMode_c, iBestMode_a, iLambda = 50;
  CMemoryAlign cMemoryAlign(0);
  int32_t iCpuCores = 0;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
  if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE41))
    return;

  // 32 rows of 32 bytes each for the reconstructed and source planes.
  uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec");
  uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc");
  uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst");

  srand((uint32_t)time(NULL));
  for(int i=0;i<(iLineSizeDec<<5);i++)
    pDec[i]=rand()%256;
  for(int i=0;i<(iLineSizeEnc<<5);i++)
    pEnc[i]=rand()%256;
  for(int i=0;i<512;i++)
    pDst[i]=rand()%256;

  // pDec+128 is row 4 of the 32-byte-stride buffer, so the row above and the
  // column to the left of the block are still inside the allocation.
  tmpa = WelsSampleSatdIntra16x16Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_c, iLambda, pDst);
  tmpb = WelsIntra16x16Combined3Satd_sse41(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_a, iLambda, pDst);

  // Non-fatal EXPECT_* on purpose: a fatal ASSERT_* would return from the
  // test body right here and skip the WelsFree calls below.
  EXPECT_EQ(tmpa, tmpb);
  EXPECT_EQ(iBestMode_c, iBestMode_a);

  cMemoryAlign.WelsFree(pDec,"pDec");
  cMemoryAlign.WelsFree(pEnc,"pEnc");
  cMemoryAlign.WelsFree(pDst,"pDst");
}
|
||||
|
||||
// Runs WelsSampleSatdIntra4x4Combined3_c and WelsSampleSatdThree4x4_sse2 on
// the same random buffers and lambda arguments and checks that both return
// the same cost and report the same best mode. Skipped when the CPU does not
// report SSE2.
TEST(IntraSadSatdFuncTest, WelsSampleSatdThree4x4_sse2){
  const int32_t iLineSizeDec = 32;
  const int32_t iLineSizeEnc = 32;
  int32_t tmpa, tmpb;
  int32_t iBestMode_c, iBestMode_a, iLambda = 50;
  // lambda[1] is used for the argument whose index matches iPredMode,
  // lambda[0] (four times larger) for the other two.
  int32_t lambda[2] = {iLambda << 2, iLambda};
  CMemoryAlign cMemoryAlign(0);
  int32_t iCpuCores = 0;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
  if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE2))
    return;

  // 32 rows of 32 bytes each for the reconstructed and source planes.
  uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec");
  uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc");
  uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst");

  srand((uint32_t)time(NULL));
  // Drawn after srand() so the chosen mode is covered by the seed set above
  // instead of depending on whatever state rand() had before this test.
  int32_t iPredMode = rand()%3;
  for(int i=0;i<(iLineSizeDec<<5);i++)
    pDec[i]=rand()%256;
  for(int i=0;i<(iLineSizeEnc<<5);i++)
    pEnc[i]=rand()%256;
  for(int i=0;i<512;i++)
    pDst[i]=rand()%256;

  // pDec+128 is row 4 of the 32-byte-stride buffer, so the row above and the
  // column to the left of the block are still inside the allocation.
  tmpa = WelsSampleSatdIntra4x4Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc, pDst, &iBestMode_c, lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]);
  tmpb = WelsSampleSatdThree4x4_sse2(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc, pDst, &iBestMode_a, lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]);

  // Non-fatal EXPECT_* on purpose: a fatal ASSERT_* would return from the
  // test body right here and skip the WelsFree calls below.
  EXPECT_EQ(tmpa, tmpb);
  EXPECT_EQ(iBestMode_c, iBestMode_a);

  cMemoryAlign.WelsFree(pDec,"pDec");
  cMemoryAlign.WelsFree(pEnc,"pEnc");
  cMemoryAlign.WelsFree(pDst,"pDst");
}
|
||||
|
||||
// Runs WelsSampleSatdIntra8x8Combined3_c and WelsIntraChroma8x8Combined3Satd_sse41
// on the same random Cb/Cr buffers and checks that both return the same cost
// and report the same best mode. Skipped when the CPU does not report SSE4.1.
TEST(IntraSadSatdFuncTest, WelsIntraChroma8x8Combined3Satd_sse41){
  const int32_t iLineSizeDec = 32;
  const int32_t iLineSizeEnc = 32;
  int32_t tmpa, tmpb;
  int32_t iBestMode_c, iBestMode_a, iLambda = 50;
  CMemoryAlign cMemoryAlign(0);
  int32_t iCpuCores = 0;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
  if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE41))
    return;

  // 32 rows of 32 bytes each per chroma plane, reconstructed and source.
  uint8_t* pDecCb = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDecCb");
  uint8_t* pEncCb = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEncCb");
  uint8_t* pDecCr = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDecCr");
  uint8_t* pEncCr = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEncCr");
  uint8_t* pDstChma = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDstChma");

  srand((uint32_t)time(NULL));
  for(int i=0;i<(iLineSizeDec<<5);i++){
    pDecCb[i]=rand()%256;
    pDecCr[i]=rand()%256;
  }
  for(int i=0;i<(iLineSizeEnc<<5);i++){
    pEncCb[i]=rand()%256;
    pEncCr[i]=rand()%256;
  }
  for(int i=0;i<512;i++)
    pDstChma[i]=rand()%256;

  // The +128 offset is row 4 of the 32-byte-stride buffers, so the row above
  // and the column to the left of each block are still inside the allocation.
  tmpa = WelsSampleSatdIntra8x8Combined3_c(pDecCb+128, iLineSizeDec, pEncCb,iLineSizeEnc,&iBestMode_c, iLambda, pDstChma, pDecCr+128, pEncCr);
  tmpb = WelsIntraChroma8x8Combined3Satd_sse41(pDecCb+128, iLineSizeDec, pEncCb,iLineSizeEnc,&iBestMode_a, iLambda, pDstChma, pDecCr+128, pEncCr);

  // Non-fatal EXPECT_* on purpose: a fatal ASSERT_* would return from the
  // test body right here and skip the WelsFree calls below.
  EXPECT_EQ(tmpa, tmpb);
  EXPECT_EQ(iBestMode_c, iBestMode_a);

  cMemoryAlign.WelsFree(pDecCb,"pDecCb");
  cMemoryAlign.WelsFree(pEncCb,"pEncCb");
  cMemoryAlign.WelsFree(pDecCr,"pDecCr");
  cMemoryAlign.WelsFree(pEncCr,"pEncCr");
  cMemoryAlign.WelsFree(pDstChma,"pDstChma");
}
|
||||
#endif
|
||||
#define ASSERT_MEMORY_FAIL2X(A, B) \
|
||||
if (NULL == B) { \
|
||||
pMemAlign->WelsFree(A, "Sad_SrcA");\
|
||||
|
Loading…
x
Reference in New Issue
Block a user