Merge "Updated vp8_build_intra_predictors_mbuv_s(sse2/ssse3)"
This commit is contained in:
@@ -126,7 +126,7 @@ prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned
|
||||
#TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
|
||||
|
||||
prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
|
||||
#TODO: fix assembly --- specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
|
||||
specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
|
||||
|
||||
prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride"
|
||||
specialize vp8_intra4x4_predict media
|
||||
|
@@ -119,35 +119,39 @@ sym(vp8_copy_mem16x16_sse2):
|
||||
;void vp8_intra_pred_uv_dc_mmx2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride,
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; )
|
||||
global sym(vp8_intra_pred_uv_dc_mmx2)
|
||||
sym(vp8_intra_pred_uv_dc_mmx2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
; from top
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
sub rsi, rax
|
||||
mov rsi, arg(2) ;above;
|
||||
pxor mm0, mm0
|
||||
movq mm1, [rsi]
|
||||
psadbw mm1, mm0
|
||||
|
||||
; from left
|
||||
dec rsi
|
||||
mov rsi, arg(3) ;left;
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
lea rdi, [rax*3]
|
||||
movzx ecx, byte [rsi+rax]
|
||||
movzx ecx, byte [rsi]
|
||||
movzx edx, byte [rsi+rax*1]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax*2]
|
||||
add ecx, edx
|
||||
|
||||
|
||||
movzx edx, byte [rsi+rdi]
|
||||
add ecx, edx
|
||||
lea rsi, [rsi+rax*4]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax]
|
||||
@@ -156,8 +160,6 @@ sym(vp8_intra_pred_uv_dc_mmx2):
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rdi]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax*4]
|
||||
add ecx, edx
|
||||
|
||||
; add up
|
||||
pextrw edx, mm1, 0x0
|
||||
@@ -192,23 +194,24 @@ sym(vp8_intra_pred_uv_dc_mmx2):
|
||||
;void vp8_intra_pred_uv_dctop_mmx2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride,
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; )
|
||||
global sym(vp8_intra_pred_uv_dctop_mmx2)
|
||||
sym(vp8_intra_pred_uv_dctop_mmx2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
;arg(3), arg(4) not used
|
||||
|
||||
; from top
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
sub rsi, rax
|
||||
mov rsi, arg(2) ;above;
|
||||
pxor mm0, mm0
|
||||
movq mm1, [rsi]
|
||||
psadbw mm1, mm0
|
||||
@@ -245,22 +248,24 @@ sym(vp8_intra_pred_uv_dctop_mmx2):
|
||||
;void vp8_intra_pred_uv_dcleft_mmx2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride,
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; )
|
||||
global sym(vp8_intra_pred_uv_dcleft_mmx2)
|
||||
sym(vp8_intra_pred_uv_dcleft_mmx2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
;arg(2) not used
|
||||
|
||||
; from left
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
dec rsi
|
||||
mov rsi, arg(3) ;left;
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
lea rdi, [rax*3]
|
||||
movzx ecx, byte [rsi]
|
||||
movzx edx, byte [rsi+rax]
|
||||
@@ -310,17 +315,20 @@ sym(vp8_intra_pred_uv_dcleft_mmx2):
|
||||
;void vp8_intra_pred_uv_dc128_mmx(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride,
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; )
|
||||
global sym(vp8_intra_pred_uv_dc128_mmx)
|
||||
sym(vp8_intra_pred_uv_dc128_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
;arg(2), arg(3), arg(4) not used
|
||||
|
||||
; write out
|
||||
movq mm1, [GLOBAL(dc_128)]
|
||||
mov rax, arg(0) ;dst;
|
||||
@@ -346,15 +354,16 @@ sym(vp8_intra_pred_uv_dc128_mmx):
|
||||
;void vp8_intra_pred_uv_tm_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride,
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; )
|
||||
%macro vp8_intra_pred_uv_tm 1
|
||||
global sym(vp8_intra_pred_uv_tm_%1)
|
||||
sym(vp8_intra_pred_uv_tm_%1):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
@@ -362,9 +371,8 @@ sym(vp8_intra_pred_uv_tm_%1):
|
||||
|
||||
; read top row
|
||||
mov edx, 4
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
sub rsi, rax
|
||||
mov rsi, arg(2) ;above
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
pxor xmm0, xmm0
|
||||
%ifidn %1, ssse3
|
||||
movdqa xmm2, [GLOBAL(dc_1024)]
|
||||
@@ -374,7 +382,7 @@ sym(vp8_intra_pred_uv_tm_%1):
|
||||
|
||||
; set up left ptrs and subtract topleft
|
||||
movd xmm3, [rsi-1]
|
||||
lea rsi, [rsi+rax-1]
|
||||
mov rsi, arg(3) ;left;
|
||||
%ifidn %1, sse2
|
||||
punpcklbw xmm3, xmm0
|
||||
pshuflw xmm3, xmm3, 0x0
|
||||
@@ -427,20 +435,22 @@ vp8_intra_pred_uv_tm ssse3
|
||||
;void vp8_intra_pred_uv_ve_mmx(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride,
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; )
|
||||
global sym(vp8_intra_pred_uv_ve_mmx)
|
||||
sym(vp8_intra_pred_uv_ve_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
; end prolog
|
||||
|
||||
; arg(3), arg(4) not used
|
||||
|
||||
; read from top
|
||||
mov rax, arg(2) ;src;
|
||||
movsxd rdx, dword ptr arg(3) ;src_stride;
|
||||
sub rax, rdx
|
||||
|
||||
movq mm1, [rax]
|
||||
|
||||
; write out
|
||||
@@ -466,15 +476,16 @@ sym(vp8_intra_pred_uv_ve_mmx):
|
||||
;void vp8_intra_pred_uv_ho_mmx2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride,
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; )
|
||||
%macro vp8_intra_pred_uv_ho 1
|
||||
global sym(vp8_intra_pred_uv_ho_%1)
|
||||
sym(vp8_intra_pred_uv_ho_%1):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
%ifidn %1, ssse3
|
||||
@@ -485,12 +496,14 @@ sym(vp8_intra_pred_uv_ho_%1):
|
||||
%endif
|
||||
; end prolog
|
||||
|
||||
;arg(2) not used
|
||||
|
||||
; read from left and write out
|
||||
%ifidn %1, mmx2
|
||||
mov edx, 4
|
||||
%endif
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
mov rsi, arg(3) ;left
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
mov rdi, arg(0) ;dst;
|
||||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
%ifidn %1, ssse3
|
||||
@@ -498,7 +511,7 @@ sym(vp8_intra_pred_uv_ho_%1):
|
||||
movdqa xmm2, [GLOBAL(dc_00001111)]
|
||||
lea rbx, [rax*3]
|
||||
%endif
|
||||
dec rsi
|
||||
|
||||
%ifidn %1, mmx2
|
||||
.vp8_intra_pred_uv_ho_%1_loop:
|
||||
movd mm0, [rsi]
|
||||
|
@@ -15,7 +15,8 @@
|
||||
|
||||
#define build_intra_predictors_mbuv_prototype(sym) \
|
||||
void sym(unsigned char *dst, int dst_stride, \
|
||||
const unsigned char *src, int src_stride)
|
||||
const unsigned char *above, \
|
||||
const unsigned char *left, int left_stride)
|
||||
typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
|
||||
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
|
||||
@@ -29,15 +30,19 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
|
||||
|
||||
static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
|
||||
unsigned char * uabove_row,
|
||||
unsigned char * vabove_row,
|
||||
unsigned char *dst_u,
|
||||
unsigned char *dst_v,
|
||||
int dst_stride,
|
||||
unsigned char * uleft,
|
||||
unsigned char * vleft,
|
||||
int left_stride,
|
||||
build_intra_predictors_mbuv_fn_t tm_func,
|
||||
build_intra_predictors_mbuv_fn_t ho_func)
|
||||
{
|
||||
int mode = x->mode_info_context->mbmi.uv_mode;
|
||||
build_intra_predictors_mbuv_fn_t fn;
|
||||
int src_stride = x->dst.uv_stride;
|
||||
|
||||
switch (mode) {
|
||||
case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
|
||||
@@ -59,38 +64,48 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
|
||||
default: return;
|
||||
}
|
||||
|
||||
fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
|
||||
fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
|
||||
fn(dst_u, dst_stride, uabove_row, uleft, left_stride);
|
||||
fn(dst_v, dst_stride, vabove_row, vleft, left_stride);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
|
||||
void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
|
||||
unsigned char * uabove_row,
|
||||
unsigned char * vabove_row,
|
||||
unsigned char * uleft,
|
||||
unsigned char * vleft,
|
||||
int left_stride,
|
||||
unsigned char * upred_ptr,
|
||||
unsigned char * vpred_ptr,
|
||||
int pred_stride)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
|
||||
&x->predictor[320], 8,
|
||||
vp8_build_intra_predictors_mbuv_x86(x,
|
||||
uabove_row, vabove_row,
|
||||
upred_ptr,
|
||||
vpred_ptr, pred_stride,
|
||||
uleft,
|
||||
vleft,
|
||||
left_stride,
|
||||
vp8_intra_pred_uv_tm_sse2,
|
||||
vp8_intra_pred_uv_ho_mmx2);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
|
||||
void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
|
||||
unsigned char * uabove_row,
|
||||
unsigned char * vabove_row,
|
||||
unsigned char * uleft,
|
||||
unsigned char * vleft,
|
||||
int left_stride,
|
||||
unsigned char * upred_ptr,
|
||||
unsigned char * vpred_ptr,
|
||||
int pred_stride)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
|
||||
&x->predictor[320], 8,
|
||||
vp8_intra_pred_uv_tm_ssse3,
|
||||
vp8_intra_pred_uv_ho_ssse3);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
|
||||
x->dst.v_buffer, x->dst.uv_stride,
|
||||
vp8_intra_pred_uv_tm_sse2,
|
||||
vp8_intra_pred_uv_ho_mmx2);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
|
||||
x->dst.v_buffer, x->dst.uv_stride,
|
||||
vp8_build_intra_predictors_mbuv_x86(x,
|
||||
uabove_row, vabove_row,
|
||||
upred_ptr,
|
||||
vpred_ptr, pred_stride,
|
||||
uleft,
|
||||
vleft,
|
||||
left_stride,
|
||||
vp8_intra_pred_uv_tm_ssse3,
|
||||
vp8_intra_pred_uv_ho_ssse3);
|
||||
}
|
||||
@@ -132,22 +147,10 @@ static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
|
||||
default: return;
|
||||
}
|
||||
|
||||
fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
|
||||
// fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
|
||||
vp8_intra_pred_y_tm_sse2);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
|
||||
vp8_intra_pred_y_tm_ssse3);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
|
||||
{
|
||||
vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
|
||||
|
Reference in New Issue
Block a user