Replacing asm 16x16 variance calculation with intrinsics.

The new code is 20% faster for 64-bit and 15% faster for 32-bit builds, measured
with binaries compiled using clang.

Change-Id: Icfea461238411001fd093561293dbfedfbf8d0bb
Dmitry Kovalev 2014-09-02 10:50:12 -07:00
parent 6b649a0db9
commit 6f6bd282c9
2 changed files with 44 additions and 147 deletions
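
For context, the 16x16 variance helper computes two accumulators over a 16x16 block: the sum of the pixel differences (src - ref) and the sum of their squares. Below is a minimal scalar sketch of that contract, written only for this note; the prototype mirrors the one in the diff, while the helper name get16x16var_ref is illustrative.

#include <stdint.h>

// Scalar illustration, not part of the commit: accumulate the sum of
// (src - ref) and the sum of (src - ref)^2 over a 16x16 block.
static void get16x16var_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            unsigned int *sse, int *sum) {
  int i, j;
  *sum = 0;
  *sse = 0;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;
      *sse += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
}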


@@ -67,145 +67,3 @@ sym(vp9_get_mb_ss_sse2):
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp9_get16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
; Prefetch data
lea rcx, [rax+rax*2]
prefetcht0 [rsi]
prefetcht0 [rsi+rax]
prefetcht0 [rsi+rax*2]
prefetcht0 [rsi+rcx]
lea rbx, [rsi+rax*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax*2]
prefetcht0 [rbx+rcx]
lea rcx, [rdx+rdx*2]
prefetcht0 [rdi]
prefetcht0 [rdi+rdx]
prefetcht0 [rdi+rdx*2]
prefetcht0 [rdi+rcx]
lea rbx, [rdi+rdx*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx*2]
prefetcht0 [rbx+rcx]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
prefetcht0 [rsi+rax*8]
prefetcht0 [rdi+rdx*8]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklbw xmm1, xmm0
punpckhbw xmm3, xmm0
punpcklbw xmm2, xmm0
punpckhbw xmm4, xmm0
psubw xmm1, xmm2
psubw xmm3, xmm4
paddw xmm7, xmm1
pmaddwd xmm1, xmm1
paddw xmm7, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm1
paddd xmm6, xmm3
add rsi, rax
add rdi, rdx
sub rcx, 1
jnz .var16loop
movdqa xmm1, xmm6
pxor xmm6, xmm6
pxor xmm5, xmm5
punpcklwd xmm6, xmm7
punpckhwd xmm5, xmm7
psrad xmm5, 16
psrad xmm6, 16
paddd xmm6, xmm5
movdqa xmm2, xmm1
punpckldq xmm1, xmm0
punpckhdq xmm2, xmm0
movdqa xmm7, xmm6
paddd xmm1, xmm2
punpckldq xmm6, xmm0
punpckhdq xmm7, xmm0
paddd xmm6, xmm7
movdqa xmm2, xmm1
movdqa xmm7, xmm6
psrldq xmm1, 8
psrldq xmm6, 8
paddd xmm7, xmm6
paddd xmm1, xmm2
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
movd DWORD PTR [rax], xmm7
movd DWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret


@@ -92,9 +92,49 @@ unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
  return 0;
}
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse, int *sum);
unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
@@ -173,8 +213,7 @@ unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}
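
The caller above turns the two accumulators into the block variance as *sse - sum*sum/256, where 256 is the pixel count of a 16x16 block. A hypothetical cross-check against the scalar sketch near the top of this page could look like the following; the harness and the name check_16x16_variance are illustrative assumptions, and only the vp9_get16x16var_sse2 prototype comes from the diff.

#include <assert.h>
#include <stdint.h>

unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum);

// Hypothetical check, not part of the commit: both versions must produce
// identical sum/sse; the variance is then formed exactly as
// vp9_variance16x16_sse2 does above.
static unsigned int check_16x16_variance(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  unsigned int sse_simd, sse_ref;
  int sum_simd, sum_ref;

  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, &sse_simd, &sum_simd);
  get16x16var_ref(src, src_stride, ref, ref_stride, &sse_ref, &sum_ref);
  assert(sse_simd == sse_ref && sum_simd == sum_ref);

  return sse_simd - (((unsigned int)sum_simd * sum_simd) >> 8);
}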