Merge "Replacing asm 8x8 variance calculation with intrinsics."
This commit is contained in:
commit
dbe2170595
@ -209,193 +209,3 @@ sym(vp9_get16x16var_sse2):
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
|
||||
|
||||
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Processes eight rows of eight bytes from src_ptr and ref_ptr. For every
; byte pair it forms (src - ref) as a 16-bit value, accumulates the
; differences in xmm7 and the squared differences in xmm1, then reduces
; both accumulators and stores them to *Sum and *SSE.
;
; Register use:
;   rsi / rdi  current src / ref base pointers
;   rax / rdx  sign-extended source_stride / recon_stride
;   xmm0       all-zero, used to widen bytes to words and in the reduction
;   xmm7       eight 16-bit running sums of differences
;   xmm1       four 32-bit running sums of squared differences
;
; NOTE(review): rax is last written with the Sum pointer and is not loaded
; with a return value afterwards, unless one of the epilog macros does so --
; confirm what callers expect from the return value.
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16             ; NOTE(review): no use of this stack space is visible in this routine
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

        pxor        xmm0,           xmm0        ; clear xmm0 for unpack
        pxor        xmm7,           xmm7        ; clear xmm7 for accumulating diffs

        ; row 0 -- also initialises the squared-difference accumulator xmm1
        movq        xmm1,           QWORD PTR [rsi]
        movq        xmm2,           QWORD PTR [rdi]

        punpcklbw   xmm1,           xmm0        ; widen bytes to words
        punpcklbw   xmm2,           xmm0

        ; operands are zero-extended bytes, so each difference lies in
        ; [-255, 255] and the saturating subtract never saturates
        psubsw      xmm1,           xmm2
        paddw       xmm7,           xmm1

        pmaddwd     xmm1,           xmm1        ; square and add adjacent pairs -> 4 dwords

        ; row 1
        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        ; row 2
        movq        xmm2,           QWORD PTR[rsi + rax * 2]
        movq        xmm3,           QWORD PTR[rdi + rdx * 2]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        ; advance both base pointers by two rows; rows 3 and 4 follow
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]
        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        movq        xmm2,           QWORD PTR[rsi + rax *2]
        movq        xmm3,           QWORD PTR[rdi + rdx *2]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        ; advance by two rows; rows 5 and 6 follow
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]

        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        movq        xmm2,           QWORD PTR[rsi + rax *2]
        movq        xmm3,           QWORD PTR[rdi + rdx *2]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        ; advance by two rows; row 7 follows
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]

        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        ; Horizontal reduction. The two accumulators are folded in an
        ; interleaved order: xmm6/xmm7 carry the 16-bit difference sums
        ; (added with paddw, so only the low word of each lane matters),
        ; xmm1/xmm2 carry the 32-bit squared sums (added with paddd).
        movdqa      xmm6,           xmm7
        punpcklwd   xmm6,           xmm0        ; low four word sums, each followed by a zero word

        punpckhwd   xmm7,           xmm0        ; high four word sums, each followed by a zero word
        movdqa      xmm2,           xmm1

        paddw       xmm6,           xmm7        ; 8 word sums -> 4
        punpckldq   xmm1,           xmm0

        punpckhdq   xmm2,           xmm0
        movdqa      xmm7,           xmm6

        paddd       xmm1,           xmm2        ; 4 dword squared sums -> 2
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm7,           xmm0
        paddw       xmm6,           xmm7        ; 4 word sums -> 2

        movdqa      xmm2,           xmm1
        movdqa      xmm7,           xmm6

        psrldq      xmm1,           8
        psrldq      xmm6,           8

        paddw       xmm7,           xmm6        ; low word of xmm7 = total of differences
        paddd       xmm1,           xmm2        ; low dword of xmm1 = total of squared differences

        mov         rax,            arg(5) ;[Sum]
        mov         rdi,            arg(4) ;[SSE]

        movq        rdx,            xmm7
        movsx       rcx,            dx          ; sign-extend the 16-bit total

        mov  dword ptr [rax],       ecx
        movd DWORD PTR [rdi],       xmm1

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
|
||||
|
@ -51,9 +51,46 @@ unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Prototype for the 8x8 kernel: takes the source and reference pixel
// pointers with their strides, plus output pointers sse and sum.
// NOTE(review): this listing is a diff hunk without +/- markers; the hunk
// header (9 old lines, 46 new) suggests this declaration is on the removed
// side, i.e. it declared the assembly routine — confirm against the file.
unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
|
||||
const unsigned char *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
// Computes, for one 8x8 block of 8-bit pixels, the sum of the per-pixel
// differences (src - ref) and the sum of their squares.
//
//   src, ref                top-left pixel of each block
//   src_stride, ref_stride  distance in bytes between consecutive rows
//   sse                     receives the sum of squared differences
//   sum                     receives the signed sum of differences
//
// Always returns 0; the results are delivered through sse and sum.
unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();  // eight 16-bit running sums
  __m128i vsse = _mm_setzero_si128();  // four 32-bit running squared sums
  int i;

  // Two rows per iteration. Each row is loaded as 8 bytes and widened to
  // 16-bit lanes by interleaving with zero, so every difference lies in
  // [-255, 255].
  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    // madd squares each lane and adds adjacent pairs into 32-bit lanes.
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum: fold 8 lanes -> 4 -> 2 -> 1. The total of 64 differences has
  // magnitude at most 64 * 255 = 16320, so 16-bit lanes cannot overflow.
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  // The extract yields the lane zero-extended; the cast restores the sign.
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse: fold 4 lanes -> 2 -> 1.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}
|
||||
|
||||
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
|
||||
const unsigned char *ref, int ref_stride,
|
||||
// 8x8 variance: obtains the block's sum of squared differences (*sse) and
// sum of differences (sum), then returns *sse - (sum * sum) / 64, where
// the >> 6 divides by the 64 pixels of an 8x8 block.
@ -110,8 +147,7 @@ unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
|
||||
const unsigned char *ref, int ref_stride,
|
||||
unsigned int *sse) {
|
||||
int sum;
|
||||
// NOTE(review): this listing is a diff hunk without +/- markers. The hunk
// header (8 old lines, 7 new) suggests the two-line variance_sse2(...) call
// is the removed code and the direct vp9_get8x8var_sse2(...) call after it
// is its replacement — confirm against the real file.
variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
|
||||
sse, &sum, vp9_get8x8var_sse2, 8);
|
||||
vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
|
||||
// sum is cast to unsigned first, so the square is computed in unsigned
// arithmetic before the shift.
return *sse - (((unsigned int)sum * sum) >> 6);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user