Write SSSE3 sub-pixel filter function

1. Process 16 pixels at one time instead of 8.
2. Add a check for the case where both xoffset=0 and yoffset=0, which
   happens during motion search.
This change gave encoder 1%~3% performance gain.

Change-Id: Idaa39506b48f4f8b2fbbeb45aae8226fa32afb3e
This commit is contained in:
Yunqing Wang 2011-03-03 19:02:45 -05:00
parent cfaee9f7c6
commit 244e2e1451
5 changed files with 496 additions and 0 deletions

View File

@ -0,0 +1,348 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp8_filter_block2d_bil_var_ssse3
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared
;
;)
;Note: The filter coefficient at offset=0 is 128. Since the second (source)
;operand of pmaddubsw is interpreted as signed bytes, 128 cannot be
;represented there, so the zero-offset cases are calculated separately.
;-----------------------------------------------------------------------
; vp8_filter_block2d_bil_var_ssse3
;
; Applies a 2-tap bilinear filter to a 16-pixel-wide block at ref_ptr
; (horizontal taps selected by xoffset, vertical taps by yoffset),
; subtracts the 16-pixel-wide block at src_ptr, and stores the sum of
; the differences in *sum and the sum of squared differences in
; *sumsquared.
;
; Code paths:
;   xoffset != 0, yoffset != 0 : horizontal pass, then vertical pass
;   xoffset != 0, yoffset == 0 : ..._fp_only    (horizontal pass only)
;   xoffset == 0, yoffset != 0 : ..._sp_only    (vertical pass only)
;   xoffset == 0, yoffset == 0 : ..._full_pixel (no filtering)
;
; Register roles:
;   rsi  = current ref row          rdi = current src row
;   rcx  = rows remaining (Height must be non-zero: loops are do/while)
;   xmm6 = 8 x int16 running sums of differences
;   xmm7 = 4 x int32 running sums of squared differences
;
; Memory reads: paths with xoffset != 0 read 17 bytes per ref row
; ([rsi] and [rsi+1]); paths with yoffset != 0 read Height+1 ref rows.
;
; rbx is deliberately never used as scratch. GET_GOT rbx reserves it as
; the base register that GLOBAL() addresses through on 32-bit PIC builds
; (see vpx_ports/x86_abi_support.asm), and GLOBAL(xmm_bi_rd) is
; referenced inside the loops. Strides therefore live in r8/r9 on
; 64-bit targets and are added straight from the argument slots on
; 32-bit targets.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_ssse3)
sym(vp8_filter_block2d_bil_var_ssse3):
        push            rbp
        mov             rbp, rsp
        SHADOW_ARGS_TO_STACK 9
        SAVE_XMM
        GET_GOT         rbx
        push            rsi
        push            rdi
        ; end prolog

        pxor            xmm6, xmm6                      ; sum accumulator
        pxor            xmm7, xmm7                      ; sse accumulator

        lea             rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
        movsxd          rax, dword ptr arg(5)           ; xoffset

        test            rax, rax                        ; skip first_pass filter if xoffset=0
        je              filter_block2d_bil_var_ssse3_sp_only

        shl             rax, 4                          ; 16 bytes of taps per offset
        lea             rax, [rax + rcx]                ; HFilter

        movsxd          rdx, dword ptr arg(6)           ; yoffset

        test            rdx, rdx                        ; skip second_pass filter if yoffset=0
        je              filter_block2d_bil_var_ssse3_fp_only

        shl             rdx, 4
        lea             rdx, [rdx + rcx]                ; VFilter

        mov             rsi, arg(0)                     ; ref_ptr
        mov             rdi, arg(2)                     ; src_ptr
        movsxd          rcx, dword ptr arg(4)           ; Height

        ; Horizontally filter row 0; the result (16 bytes) stays in xmm0
        ; as the "previous row" for the vertical pass.
        movdqu          xmm0, XMMWORD PTR [rsi]
        movdqu          xmm1, XMMWORD PTR [rsi+1]
        movdqa          xmm2, xmm0

        punpcklbw       xmm0, xmm1                      ; interleave p[i], p[i+1]
        punpckhbw       xmm2, xmm1
        pmaddubsw       xmm0, [rax]
        pmaddubsw       xmm2, [rax]

        paddw           xmm0, [GLOBAL(xmm_bi_rd)]
        paddw           xmm2, [GLOBAL(xmm_bi_rd)]
        psraw           xmm0, xmm_filter_shift
        psraw           xmm2, xmm_filter_shift

        packuswb        xmm0, xmm2

%if ABI_IS_32BIT
        add             rsi, dword ptr arg(1)           ; ref_pixels_per_line
%else
        movsxd          r8, dword ptr arg(1)            ; ref_pixels_per_line
        movsxd          r9, dword ptr arg(3)            ; src_pixels_per_line
        lea             rsi, [rsi + r8]
%endif

filter_block2d_bil_var_ssse3_loop:
        ; Horizontal pass on the next ref row -> xmm1 (16 bytes).
        movdqu          xmm1, XMMWORD PTR [rsi]
        movdqu          xmm2, XMMWORD PTR [rsi+1]
        movdqa          xmm3, xmm1

        punpcklbw       xmm1, xmm2
        punpckhbw       xmm3, xmm2
        pmaddubsw       xmm1, [rax]
        pmaddubsw       xmm3, [rax]

        paddw           xmm1, [GLOBAL(xmm_bi_rd)]
        paddw           xmm3, [GLOBAL(xmm_bi_rd)]
        psraw           xmm1, xmm_filter_shift
        psraw           xmm3, xmm_filter_shift
        packuswb        xmm1, xmm3

        ; Vertical pass between the previous row (xmm0) and this row (xmm1);
        ; this row becomes the previous row for the next iteration.
        movdqa          xmm2, xmm0
        movdqa          xmm0, xmm1
        movdqa          xmm3, xmm2

        punpcklbw       xmm2, xmm1
        punpckhbw       xmm3, xmm1
        pmaddubsw       xmm2, [rdx]
        pmaddubsw       xmm3, [rdx]

        paddw           xmm2, [GLOBAL(xmm_bi_rd)]
        paddw           xmm3, [GLOBAL(xmm_bi_rd)]
        psraw           xmm2, xmm_filter_shift
        psraw           xmm3, xmm_filter_shift

        ; Widen 16 src pixels to words and accumulate diff / diff^2.
        movq            xmm1, QWORD PTR [rdi]
        pxor            xmm4, xmm4
        punpcklbw       xmm1, xmm4
        movq            xmm5, QWORD PTR [rdi+8]
        punpcklbw       xmm5, xmm4

        psubw           xmm2, xmm1
        psubw           xmm3, xmm5
        paddw           xmm6, xmm2
        paddw           xmm6, xmm3
        pmaddwd         xmm2, xmm2
        pmaddwd         xmm3, xmm3
        paddd           xmm7, xmm2
        paddd           xmm7, xmm3

%if ABI_IS_32BIT
        add             rsi, dword ptr arg(1)           ; ref_pixels_per_line
        add             rdi, dword ptr arg(3)           ; src_pixels_per_line
%else
        lea             rsi, [rsi + r8]
        lea             rdi, [rdi + r9]
%endif

        sub             rcx, 1
        jnz             filter_block2d_bil_var_ssse3_loop

        jmp             filter_block2d_bil_variance

filter_block2d_bil_var_ssse3_sp_only:
        ; xoffset == 0: vertical filter only. rcx still holds the filter table.
        movsxd          rdx, dword ptr arg(6)           ; yoffset

        test            rdx, rdx                        ; both xoffset=0 and yoffset=0
        je              filter_block2d_bil_var_ssse3_full_pixel

        shl             rdx, 4
        lea             rdx, [rdx + rcx]                ; VFilter

        mov             rsi, arg(0)                     ; ref_ptr
        mov             rdi, arg(2)                     ; src_ptr
        movsxd          rcx, dword ptr arg(4)           ; Height
        movsxd          rax, dword ptr arg(1)           ; ref_pixels_per_line

        movdqu          xmm1, XMMWORD PTR [rsi]         ; row 0 = first "previous row"

%if ABI_IS_32BIT=0
        movsxd          r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

        lea             rsi, [rsi + rax]

filter_block2d_bil_sp_only_loop:
        movdqu          xmm3, XMMWORD PTR [rsi]         ; next ref row
        movdqa          xmm2, xmm1
        movdqa          xmm0, xmm3                      ; keep it for the next iteration

        punpcklbw       xmm1, xmm3                      ; interleave prev/next rows
        punpckhbw       xmm2, xmm3
        pmaddubsw       xmm1, [rdx]
        pmaddubsw       xmm2, [rdx]

        paddw           xmm1, [GLOBAL(xmm_bi_rd)]
        paddw           xmm2, [GLOBAL(xmm_bi_rd)]
        psraw           xmm1, xmm_filter_shift
        psraw           xmm2, xmm_filter_shift

        movq            xmm3, QWORD PTR [rdi]
        pxor            xmm4, xmm4
        punpcklbw       xmm3, xmm4
        movq            xmm5, QWORD PTR [rdi+8]
        punpcklbw       xmm5, xmm4

        psubw           xmm1, xmm3
        psubw           xmm2, xmm5
        paddw           xmm6, xmm1
        paddw           xmm6, xmm2
        pmaddwd         xmm1, xmm1
        pmaddwd         xmm2, xmm2
        paddd           xmm7, xmm1
        paddd           xmm7, xmm2

        movdqa          xmm1, xmm0                      ; next row becomes previous row
        lea             rsi, [rsi + rax]                ; ref_pixels_per_line
%if ABI_IS_32BIT
        add             rdi, dword ptr arg(3)           ; src_pixels_per_line
%else
        lea             rdi, [rdi + r9]
%endif

        sub             rcx, 1
        jnz             filter_block2d_bil_sp_only_loop

        jmp             filter_block2d_bil_variance

filter_block2d_bil_var_ssse3_full_pixel:
        ; xoffset == 0 and yoffset == 0: plain difference, no filtering.
        mov             rsi, arg(0)                     ; ref_ptr
        mov             rdi, arg(2)                     ; src_ptr
        movsxd          rcx, dword ptr arg(4)           ; Height
        movsxd          rax, dword ptr arg(1)           ; ref_pixels_per_line
        movsxd          rdx, dword ptr arg(3)           ; src_pixels_per_line
        pxor            xmm0, xmm0                      ; zero for byte->word unpack

filter_block2d_bil_full_pixel_loop:
        movq            xmm1, QWORD PTR [rsi]
        punpcklbw       xmm1, xmm0
        movq            xmm2, QWORD PTR [rsi+8]
        punpcklbw       xmm2, xmm0

        movq            xmm3, QWORD PTR [rdi]
        punpcklbw       xmm3, xmm0
        movq            xmm4, QWORD PTR [rdi+8]
        punpcklbw       xmm4, xmm0

        psubw           xmm1, xmm3
        psubw           xmm2, xmm4
        paddw           xmm6, xmm1
        paddw           xmm6, xmm2
        pmaddwd         xmm1, xmm1
        pmaddwd         xmm2, xmm2
        paddd           xmm7, xmm1
        paddd           xmm7, xmm2

        lea             rsi, [rsi + rax]                ; ref_pixels_per_line
        lea             rdi, [rdi + rdx]                ; src_pixels_per_line
        sub             rcx, 1
        jnz             filter_block2d_bil_full_pixel_loop

        jmp             filter_block2d_bil_variance

filter_block2d_bil_var_ssse3_fp_only:
        ; yoffset == 0: horizontal filter only. rax already holds HFilter.
        mov             rsi, arg(0)                     ; ref_ptr
        mov             rdi, arg(2)                     ; src_ptr
        movsxd          rcx, dword ptr arg(4)           ; Height
        movsxd          rdx, dword ptr arg(1)           ; ref_pixels_per_line

%if ABI_IS_32BIT=0
        movsxd          r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

filter_block2d_bil_fp_only_loop:
        movdqu          xmm1, XMMWORD PTR [rsi]
        movdqu          xmm2, XMMWORD PTR [rsi+1]
        movdqa          xmm3, xmm1

        punpcklbw       xmm1, xmm2
        punpckhbw       xmm3, xmm2
        pmaddubsw       xmm1, [rax]
        pmaddubsw       xmm3, [rax]

        paddw           xmm1, [GLOBAL(xmm_bi_rd)]
        paddw           xmm3, [GLOBAL(xmm_bi_rd)]
        psraw           xmm1, xmm_filter_shift
        psraw           xmm3, xmm_filter_shift

        movq            xmm2, QWORD PTR [rdi]
        pxor            xmm4, xmm4
        punpcklbw       xmm2, xmm4
        movq            xmm5, QWORD PTR [rdi+8]
        punpcklbw       xmm5, xmm4

        psubw           xmm1, xmm2
        psubw           xmm3, xmm5
        paddw           xmm6, xmm1
        paddw           xmm6, xmm3
        pmaddwd         xmm1, xmm1
        pmaddwd         xmm3, xmm3
        paddd           xmm7, xmm1
        paddd           xmm7, xmm3

        lea             rsi, [rsi + rdx]                ; ref_pixels_per_line
%if ABI_IS_32BIT
        add             rdi, dword ptr arg(3)           ; src_pixels_per_line
%else
        lea             rdi, [rdi + r9]
%endif

        sub             rcx, 1
        jnz             filter_block2d_bil_fp_only_loop
        ; falls through to the reduction below

filter_block2d_bil_variance:
        ; Horizontal reduction of the accumulators.
        pxor            xmm0, xmm0
        pxor            xmm1, xmm1
        pxor            xmm5, xmm5

        ; Sign-extend the eight int16 sums in xmm6 to int32: place each word
        ; in the high half of a dword, then arithmetic-shift right by 16.
        punpcklwd       xmm0, xmm6
        punpckhwd       xmm1, xmm6
        psrad           xmm0, 16
        psrad           xmm1, 16
        paddd           xmm0, xmm1                      ; 4 x int32 partial sums
        movdqa          xmm1, xmm0

        ; Fold the four dwords of each accumulator into its low dword.
        movdqa          xmm6, xmm7
        punpckldq       xmm6, xmm5
        punpckhdq       xmm7, xmm5
        paddd           xmm6, xmm7

        punpckldq       xmm0, xmm5
        punpckhdq       xmm1, xmm5
        paddd           xmm0, xmm1

        movdqa          xmm7, xmm6
        movdqa          xmm1, xmm0

        psrldq          xmm7, 8
        psrldq          xmm1, 8

        paddd           xmm6, xmm7                      ; low dword = total SSE
        paddd           xmm0, xmm1                      ; low dword = total sum

        mov             rsi, arg(7)                     ; [Sum]
        mov             rdi, arg(8)                     ; [SSE]

        movd            [rsi], xmm0
        movd            [rdi], xmm6

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret
SECTION_RODATA
align 16
; Rounding term added to every filtered word before the arithmetic right
; shift by xmm_filter_shift (7): 64 is half of 1 << 7.
xmm_bi_rd:
times 8 dw 64
align 16
; Bilinear tap table for pmaddubsw. One 16-byte row per offset 0..7 (the code
; indexes it with offset << 4); each row repeats the byte pair
; (128 - 16*offset, 16*offset) eight times so it lines up with pixel pairs
; interleaved by punpcklbw/punpckhbw.
; Row 0 (128, 0) is never fed to pmaddubsw: the function branches to the
; unfiltered paths when an offset is 0, because 128 does not fit in a
; signed byte.
vp8_bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 112, 16
times 8 db 96, 32
times 8 db 80, 48
times 8 db 64, 64
times 8 db 48, 80
times 8 db 32, 96
times 8 db 16, 112

View File

@ -0,0 +1,140 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp8/encoder/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
/* Prototypes of routines implemented outside this file.
 * NOTE(review): the *_sse2 routines are presumably the existing SSE2
 * assembly helpers; confirm against the SSE2 variance sources. */

/* Declared here but not called by the function defined in this file. */
extern unsigned int vp8_get16x16var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
/* Used below for the xoffset == 4 && yoffset == 4 case; called once per
 * 8-pixel-wide half of the 16-wide block. */
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
/* Used below for the xoffset == 4 && yoffset == 0 case; called once per
 * 8-pixel-wide half of the 16-wide block. */
extern void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
/* Used below for the xoffset == 0 && yoffset == 4 case; called once per
 * 8-pixel-wide half of the 16-wide block. */
extern void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
/* General bilinear case: handles all 16 columns in one call and writes the
 * sum of differences to *sum and the sum of squared differences to
 * *sumsquared. */
extern void vp8_filter_block2d_bil_var_ssse3
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
/* Sub-pixel variance of a 16x16 block.
 *
 * src_ptr / src_pixels_per_line : block that is bilinearly interpolated
 * xoffset, yoffset              : sub-pixel offsets selecting the filter taps
 * dst_ptr / dst_pixels_per_line : block the interpolated result is compared to
 * sse                           : receives the sum of squared differences
 *
 * Returns sse - sum*sum/256, i.e. the variance scaled by the 256 pixels.
 *
 * The three pure half-pixel cases (offset value 4) are routed to the SSE2
 * routines, which are called once per 8-pixel-wide half; every other offset
 * pair goes to the SSSE3 routine, which covers all 16 columns in one call.
 */
unsigned int vp8_sub_pixel_variance16x16_ssse3
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    // note we could avoid these if statements if the calling function
    // just called the appropriate functions inside.
    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);

        vp8_half_horiz_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            &xsum1, &xxsum1);

        xsum0 += xsum1;
        xxsum0 += xxsum1;
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);

        vp8_half_vert_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            &xsum1, &xxsum1);

        xsum0 += xsum1;
        xxsum0 += xxsum1;
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);

        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            &xsum1, &xxsum1);

        xsum0 += xsum1;
        xxsum0 += xxsum1;
    }
    else
    {
        vp8_filter_block2d_bil_var_ssse3(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0);
    }

    *sse = xxsum0;

    // |xsum0| can reach 255 * 256 = 65280 for a 16x16 block, so its square
    // (up to 4261478400) overflows a signed int but still fits in 32 unsigned
    // bits. Do the multiply in unsigned arithmetic to avoid undefined
    // behaviour; the result is the same for negative sums.
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}

View File

@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#if HAVE_SSSE3
extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad16x16x3
@ -294,6 +295,9 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#undef vp8_variance_sad16x8x3
#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
#endif
#endif

View File

@ -334,6 +334,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
}

View File

@ -110,6 +110,8 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm