Merge "Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3."
This commit is contained in:
commit
914f7c36d7
@@ -584,23 +584,35 @@ sym(vp8_intra_pred_uv_ve_mmx):
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; )
|
||||
global sym(vp8_intra_pred_uv_ho_mmx2)
|
||||
sym(vp8_intra_pred_uv_ho_mmx2):
|
||||
%macro vp8_intra_pred_uv_ho 1
|
||||
global sym(vp8_intra_pred_uv_ho_%1)
|
||||
sym(vp8_intra_pred_uv_ho_%1):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
%ifidn %1, ssse3
|
||||
push rbx
|
||||
%endif
|
||||
; end prolog
|
||||
|
||||
; read from left and write out
|
||||
%ifidn %1, mmx2
|
||||
mov edx, 4
|
||||
%endif
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
mov rdi, arg(0) ;dst;
|
||||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
%ifidn %1, ssse3
|
||||
lea rbx, [rax*3]
|
||||
lea rdx, [rcx*3]
|
||||
movdqa xmm2, [GLOBAL(dc_00001111)]
|
||||
%endif
|
||||
dec rsi
|
||||
vp8_intra_pred_uv_ho_mmx2_loop:
|
||||
%ifidn %1, mmx2
|
||||
vp8_intra_pred_uv_ho_%1_loop:
|
||||
movd mm0, [rsi]
|
||||
movd mm1, [rsi+rax]
|
||||
punpcklbw mm0, mm0
|
||||
@@ -612,14 +624,49 @@ vp8_intra_pred_uv_ho_mmx2_loop:
|
||||
lea rsi, [rsi+rax*2]
|
||||
lea rdi, [rdi+rcx*2]
|
||||
dec edx
|
||||
jnz vp8_intra_pred_uv_ho_mmx2_loop
|
||||
jnz vp8_intra_pred_uv_ho_%1_loop
|
||||
%else
|
||||
movd xmm0, [rsi]
|
||||
movd xmm3, [rsi+rax]
|
||||
movd xmm1, [rsi+rax*2]
|
||||
movd xmm4, [rsi+rbx]
|
||||
punpcklbw xmm0, xmm3
|
||||
punpcklbw xmm1, xmm4
|
||||
pshufb xmm0, xmm2
|
||||
pshufb xmm1, xmm2
|
||||
movq [rdi ], xmm0
|
||||
movhps [rdi+rcx], xmm0
|
||||
movq [rdi+rcx*2], xmm1
|
||||
movhps [rdi+rdx], xmm1
|
||||
lea rsi, [rsi+rax*4]
|
||||
lea rdi, [rdi+rcx*4]
|
||||
movd xmm0, [rsi]
|
||||
movd xmm3, [rsi+rax]
|
||||
movd xmm1, [rsi+rax*2]
|
||||
movd xmm4, [rsi+rbx]
|
||||
punpcklbw xmm0, xmm3
|
||||
punpcklbw xmm1, xmm4
|
||||
pshufb xmm0, xmm2
|
||||
pshufb xmm1, xmm2
|
||||
movq [rdi ], xmm0
|
||||
movhps [rdi+rcx], xmm0
|
||||
movq [rdi+rcx*2], xmm1
|
||||
movhps [rdi+rdx], xmm1
|
||||
%endif
|
||||
|
||||
; begin epilog
|
||||
%ifidn %1, ssse3
|
||||
pop rbx
|
||||
%endif
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
vp8_intra_pred_uv_ho mmx2
|
||||
vp8_intra_pred_uv_ho ssse3
|
||||
|
||||
SECTION_RODATA
|
||||
dc_128:
|
||||
@@ -629,3 +676,7 @@ dc_4:
|
||||
align 16
|
||||
dc_1024:
|
||||
times 8 dw 0x400
|
||||
align 16
|
||||
dc_00001111:
|
||||
times 8 db 0
|
||||
times 8 db 1
|
||||
|
@@ -23,6 +23,7 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
|
||||
@@ -31,7 +32,8 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
|
||||
unsigned char *dst_u,
|
||||
unsigned char *dst_v,
|
||||
int dst_stride,
|
||||
build_intra_predictors_mbuv_fn_t tm_func)
|
||||
build_intra_predictors_mbuv_fn_t tm_func,
|
||||
build_intra_predictors_mbuv_fn_t ho_func)
|
||||
{
|
||||
int mode = x->mode_info_context->mbmi.uv_mode;
|
||||
build_intra_predictors_mbuv_fn_t fn;
|
||||
@@ -39,7 +41,7 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
|
||||
|
||||
switch (mode) {
|
||||
case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
|
||||
case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
|
||||
case H_PRED: fn = ho_func; break;
|
||||
case TM_PRED: fn = tm_func; break;
|
||||
case DC_PRED:
|
||||
if (x->up_available) {
|
||||
@@ -65,26 +67,30 @@ void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
|
||||
&x->predictor[320], 8,
|
||||
vp8_intra_pred_uv_tm_sse2);
|
||||
vp8_intra_pred_uv_tm_sse2,
|
||||
vp8_intra_pred_uv_ho_mmx2);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
|
||||
&x->predictor[320], 8,
|
||||
vp8_intra_pred_uv_tm_ssse3);
|
||||
vp8_intra_pred_uv_tm_ssse3,
|
||||
vp8_intra_pred_uv_ho_ssse3);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
|
||||
x->dst.v_buffer, x->dst.uv_stride,
|
||||
vp8_intra_pred_uv_tm_sse2);
|
||||
vp8_intra_pred_uv_tm_sse2,
|
||||
vp8_intra_pred_uv_ho_mmx2);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
|
||||
x->dst.v_buffer, x->dst.uv_stride,
|
||||
vp8_intra_pred_uv_tm_ssse3);
|
||||
vp8_intra_pred_uv_tm_ssse3,
|
||||
vp8_intra_pred_uv_ho_ssse3);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user