Replacing asm 16x16 variance calculation with intrinsics.
New code is 20% faster for 64-bit and 15% faster for 32-bit. Compiled using clang. Change-Id: Icfea461238411001fd093561293dbfedfbf8d0bb
This commit is contained in:
parent
6b649a0db9
commit
6f6bd282c9
@ -67,145 +67,3 @@ sym(vp9_get_mb_ss_sse2):
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
; unsigned int vp9_get16x16var_sse2
; (
;     unsigned char *src_ptr,        ; arg(0) source block
;     int            source_stride,  ; arg(1) bytes between source rows
;     unsigned char *ref_ptr,        ; arg(2) reference block
;     int            recon_stride,   ; arg(3) bytes between reference rows
;     unsigned int  *SSE,            ; arg(4) out: sum of squared differences
;     int           *Sum             ; arg(5) out: sum of differences
; )
; Computes, over a 16x16 block of 8-bit pixels:
;     *SSE = sum((src - ref)^2)      *Sum = sum(src - ref)
; The declared unsigned int return value is not set to anything meaningful.
;
; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / arg() are the
; project's ABI-support macros -- presumably they normalize Win64 vs SysV
; argument access; confirm against x86_abi_support.asm.
;
; Register roles in the main loop:
;     rsi = src row pointer        rax = source_stride
;     rdi = ref row pointer        rdx = recon_stride
;     rcx = row counter (16..1)
;     xmm0 = zero (for byte->word unpacking)
;     xmm7 = 8 x int16 running sum of diffs
;     xmm6 = 4 x int32 running sum of squared diffs
; 16-bit accumulation in xmm7 is safe: each lane receives 2 diffs per row
; (low and high half) x 16 rows = 32 values in [-255, 255], max |8160|.
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push         rbp
    mov          rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM     7
    push         rbx
    push         rsi
    push         rdi
    ; end prolog

    mov          rsi, arg(0)                ; rsi = src_ptr
    mov          rdi, arg(2)                ; rdi = ref_ptr

    movsxd       rax, DWORD PTR arg(1)      ; rax = source_stride (sign-extended)
    movsxd       rdx, DWORD PTR arg(3)      ; rdx = recon_stride  (sign-extended)

    ; Prefetch the first 8 rows of the source block.
    lea          rcx, [rax+rax*2]           ; rcx = 3 * source_stride
    prefetcht0   [rsi]                      ; row 0
    prefetcht0   [rsi+rax]                  ; row 1
    prefetcht0   [rsi+rax*2]                ; row 2
    prefetcht0   [rsi+rcx]                  ; row 3
    lea          rbx, [rsi+rax*4]           ; rbx = &src[4 * stride]
    prefetcht0   [rbx]                      ; row 4
    prefetcht0   [rbx+rax]                  ; row 5
    prefetcht0   [rbx+rax*2]                ; row 6
    prefetcht0   [rbx+rcx]                  ; row 7

    ; Prefetch the first 8 rows of the reference block.
    lea          rcx, [rdx+rdx*2]           ; rcx = 3 * recon_stride
    prefetcht0   [rdi]
    prefetcht0   [rdi+rdx]
    prefetcht0   [rdi+rdx*2]
    prefetcht0   [rdi+rcx]
    lea          rbx, [rdi+rdx*4]           ; rbx = &ref[4 * stride]
    prefetcht0   [rbx]
    prefetcht0   [rbx+rdx]
    prefetcht0   [rbx+rdx*2]
    prefetcht0   [rbx+rcx]

    pxor         xmm0, xmm0                 ; zero, used to widen bytes to words
    pxor         xmm7, xmm7                 ; diff accumulator (8 x int16)

    pxor         xmm6, xmm6                 ; sse accumulator (4 x int32)
    mov          rcx, 16                    ; 16 rows to process

.var16loop:
    movdqu       xmm1, XMMWORD PTR [rsi]    ; load 16 src pixels (unaligned)
    movdqu       xmm2, XMMWORD PTR [rdi]    ; load 16 ref pixels (unaligned)

    prefetcht0   [rsi+rax*8]                ; prefetch 8 rows ahead
    prefetcht0   [rdi+rdx*8]

    movdqa       xmm3, xmm1                 ; copy src for the high half
    movdqa       xmm4, xmm2                 ; copy ref for the high half

    ; Zero-extend the 8-bit pixels to 16 bits.
    punpcklbw    xmm1, xmm0                 ; src pixels 0..7  -> words
    punpckhbw    xmm3, xmm0                 ; src pixels 8..15 -> words

    punpcklbw    xmm2, xmm0                 ; ref pixels 0..7  -> words
    punpckhbw    xmm4, xmm0                 ; ref pixels 8..15 -> words

    ; 16-bit differences, range [-255, 255].
    psubw        xmm1, xmm2                 ; diff of pixels 0..7
    psubw        xmm3, xmm4                 ; diff of pixels 8..15

    paddw        xmm7, xmm1                 ; accumulate diff sum (low half)
    pmaddwd      xmm1, xmm1                 ; pairwise d*d sums, widened to int32

    paddw        xmm7, xmm3                 ; accumulate diff sum (high half)
    pmaddwd      xmm3, xmm3

    paddd        xmm6, xmm1                 ; accumulate sse
    paddd        xmm6, xmm3

    add          rsi, rax                   ; advance to the next src row
    add          rdi, rdx                   ; advance to the next ref row

    sub          rcx, 1
    jnz          .var16loop

    ; ---- Horizontal reductions ----
    movdqa       xmm1, xmm6                 ; xmm1 = sse dword partials
    pxor         xmm6, xmm6

    ; Sign-extend the eight 16-bit partial sums in xmm7 to 32 bits:
    ; interleave them into the HIGH halves of dwords, then arithmetic-shift
    ; right by 16 so the sign is preserved.
    pxor         xmm5, xmm5
    punpcklwd    xmm6, xmm7                 ; words 0..3 into dword high halves

    punpckhwd    xmm5, xmm7                 ; words 4..7 into dword high halves
    psrad        xmm5, 16                   ; sign-extend

    psrad        xmm6, 16                   ; sign-extend
    paddd        xmm6, xmm5                 ; xmm6 = 4 dword partial sums

    ; Fold the sse dwords: zero-interleave low/high pairs, then add.
    movdqa       xmm2, xmm1
    punpckldq    xmm1, xmm0                 ; [sse0, 0, sse1, 0]

    punpckhdq    xmm2, xmm0                 ; [sse2, 0, sse3, 0]
    movdqa       xmm7, xmm6                 ; copy sum partials for the same fold

    paddd        xmm1, xmm2                 ; [sse0+sse2, 0, sse1+sse3, 0]
    punpckldq    xmm6, xmm0                 ; [sum0, 0, sum1, 0]

    punpckhdq    xmm7, xmm0                 ; [sum2, 0, sum3, 0]
    paddd        xmm6, xmm7                 ; [sum0+sum2, 0, sum1+sum3, 0]

    ; Final fold: add the upper qword onto the lower; totals land in lane 0.
    movdqa       xmm2, xmm1
    movdqa       xmm7, xmm6

    psrldq       xmm1, 8
    psrldq       xmm6, 8

    paddd        xmm7, xmm6                 ; xmm7 lane 0 = total Sum
    paddd        xmm1, xmm2                 ; xmm1 lane 0 = total SSE

    mov          rax, arg(5)                ; rax = Sum out pointer
    mov          rdi, arg(4)                ; rdi = SSE out pointer

    movd         DWORD PTR [rax], xmm7      ; *Sum = total sum of diffs
    movd         DWORD PTR [rdi], xmm1      ; *SSE = total sum of squared diffs

    ; begin epilog
    pop          rdi
    pop          rsi
    pop          rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop          rbp
    ret
||||
|
@ -92,9 +92,49 @@ unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Compute the variance components of a 16x16 block of 8-bit pixels:
//   *sse = sum((src - ref)^2)    *sum = sum(src - ref)
// Returns 0; the return value is unused but kept for signature compatibility
// with the other get_var helpers.
//
// 16-bit accumulation of the differences is safe: each of the 8 int16 lanes
// of vsum receives 2 diffs per row (low and high half) x 16 rows = 32 values
// in [-255, 255], so a lane peaks at |8160|; the horizontal fold below peaks
// at 4 * 8160 = 32640, which still fits in int16.
unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();  // 8 x int16 running sum of diffs
  __m128i vsse = _mm_setzero_si128();  // 4 x int32 running sum of squared diffs
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    // Zero-extend pixels 0..7 to 16 bits and take the difference.
    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    // Same for pixels 8..15.
    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    // madd produces pairwise d*d sums already widened to 32 bits, so the
    // squared terms cannot overflow their lanes.
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // Fold the eight 16-bit partial sums; the two remaining lanes must be
  // read back as signed 16-bit values before the final scalar add.
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // Fold the four 32-bit partial SSEs into lane 0.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}
|
||||
|
||||
|
||||
static void variance_sse2(const unsigned char *src, int src_stride,
|
||||
const unsigned char *ref, int ref_stride,
|
||||
// Variance of a 16x16 block:
//   variance = SSE - sum^2 / 256
// The raw SSE is also returned to the caller through *sse.
// NOTE(review): the diff residue showed both the old generic
// variance_sse2() dispatch and the new direct call; the specialized 16x16
// helper is called directly here, matching the post-change code.
unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  // sum^2 / 256 == (sum * sum) >> 8; the cast keeps the multiply unsigned
  // so the shift is well-defined.
  return *sse - (((unsigned int)sum * sum) >> 8);
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user