Merge "Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3."

This commit is contained in:
Scott LaVarnway 2011-05-19 11:22:01 -07:00 committed by Code Review
commit 914f7c36d7
2 changed files with 67 additions and 10 deletions

View File

@ -584,23 +584,35 @@ sym(vp8_intra_pred_uv_ve_mmx):
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_uv_ho_mmx2)
sym(vp8_intra_pred_uv_ho_mmx2):
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1)
sym(vp8_intra_pred_uv_ho_%1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
%ifidn %1, ssse3
push rbx
%endif
; end prolog
; read from left and write out
%ifidn %1, mmx2
mov edx, 4
%endif
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
%ifidn %1, ssse3
lea rbx, [rax*3]
lea rdx, [rcx*3]
movdqa xmm2, [GLOBAL(dc_00001111)]
%endif
dec rsi
vp8_intra_pred_uv_ho_mmx2_loop:
%ifidn %1, mmx2
vp8_intra_pred_uv_ho_%1_loop:
movd mm0, [rsi]
movd mm1, [rsi+rax]
punpcklbw mm0, mm0
@ -612,14 +624,49 @@ vp8_intra_pred_uv_ho_mmx2_loop:
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
jnz vp8_intra_pred_uv_ho_mmx2_loop
jnz vp8_intra_pred_uv_ho_%1_loop
%else
movd xmm0, [rsi]
movd xmm3, [rsi+rax]
movd xmm1, [rsi+rax*2]
movd xmm4, [rsi+rbx]
punpcklbw xmm0, xmm3
punpcklbw xmm1, xmm4
pshufb xmm0, xmm2
pshufb xmm1, xmm2
movq [rdi ], xmm0
movhps [rdi+rcx], xmm0
movq [rdi+rcx*2], xmm1
movhps [rdi+rdx], xmm1
lea rsi, [rsi+rax*4]
lea rdi, [rdi+rcx*4]
movd xmm0, [rsi]
movd xmm3, [rsi+rax]
movd xmm1, [rsi+rax*2]
movd xmm4, [rsi+rbx]
punpcklbw xmm0, xmm3
punpcklbw xmm1, xmm4
pshufb xmm0, xmm2
pshufb xmm1, xmm2
movq [rdi ], xmm0
movhps [rdi+rcx], xmm0
movq [rdi+rcx*2], xmm1
movhps [rdi+rdx], xmm1
%endif
; begin epilog
%ifidn %1, ssse3
pop rbx
%endif
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%endmacro
vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3
SECTION_RODATA
dc_128:
@ -629,3 +676,7 @@ dc_4:
align 16
dc_1024:
times 8 dw 0x400
align 16
dc_00001111:
times 8 db 0
times 8 db 1

View File

@ -23,6 +23,7 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
@ -31,7 +32,8 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_stride,
build_intra_predictors_mbuv_fn_t tm_func)
build_intra_predictors_mbuv_fn_t tm_func,
build_intra_predictors_mbuv_fn_t ho_func)
{
int mode = x->mode_info_context->mbmi.uv_mode;
build_intra_predictors_mbuv_fn_t fn;
@ -39,7 +41,7 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
switch (mode) {
case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
case H_PRED: fn = ho_func; break;
case TM_PRED: fn = tm_func; break;
case DC_PRED:
if (x->up_available) {
@ -65,26 +67,30 @@ void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
&x->predictor[320], 8,
vp8_intra_pred_uv_tm_sse2);
vp8_intra_pred_uv_tm_sse2,
vp8_intra_pred_uv_ho_mmx2);
}
void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
&x->predictor[320], 8,
vp8_intra_pred_uv_tm_ssse3);
vp8_intra_pred_uv_tm_ssse3,
vp8_intra_pred_uv_ho_ssse3);
}
void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
x->dst.v_buffer, x->dst.uv_stride,
vp8_intra_pred_uv_tm_sse2);
vp8_intra_pred_uv_tm_sse2,
vp8_intra_pred_uv_ho_mmx2);
}
void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
x->dst.v_buffer, x->dst.uv_stride,
vp8_intra_pred_uv_tm_ssse3);
vp8_intra_pred_uv_tm_ssse3,
vp8_intra_pred_uv_ho_ssse3);
}