Updated vp8_build_intra_predictors_mby_s (sse2/ssse3) to work with the latest code.
Patch Set 2: aligned the above_row buffers to fix a crash.
Change-Id: I7a6992a20ed079ccd302f8c26215cf3057f8b70c
This commit is contained in:
@@ -14,7 +14,7 @@
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
#include "blockd.h"
|
||||
|
||||
void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
|
||||
void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
|
||||
unsigned char * yabove_row,
|
||||
unsigned char * yleft,
|
||||
int left_stride,
|
||||
|
||||
@@ -123,7 +123,8 @@ specialize vp8_copy_mem8x4 mmx media neon
|
||||
vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
|
||||
|
||||
prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"
|
||||
#TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
|
||||
specialize vp8_build_intra_predictors_mby_s sse2 ssse3
|
||||
#TODO: fix assembly for neon
|
||||
|
||||
prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
|
||||
specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
|
||||
|
||||
@@ -133,22 +133,20 @@ sym(vp8_intra_pred_uv_dc_mmx2):
|
||||
; end prolog
|
||||
|
||||
; from top
|
||||
mov rsi, arg(2) ;above;
|
||||
pxor mm0, mm0
|
||||
movq mm1, [rsi]
|
||||
psadbw mm1, mm0
|
||||
|
||||
; from left
|
||||
mov rdi, arg(2) ;above;
|
||||
mov rsi, arg(3) ;left;
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
pxor mm0, mm0
|
||||
movq mm1, [rdi]
|
||||
lea rdi, [rax*3]
|
||||
psadbw mm1, mm0
|
||||
; from left
|
||||
movzx ecx, byte [rsi]
|
||||
movzx edx, byte [rsi+rax*1]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax*2]
|
||||
add ecx, edx
|
||||
|
||||
|
||||
movzx edx, byte [rsi+rdi]
|
||||
lea rsi, [rsi+rax*4]
|
||||
add ecx, edx
|
||||
@@ -166,23 +164,23 @@ sym(vp8_intra_pred_uv_dc_mmx2):
|
||||
lea edx, [edx+ecx+8]
|
||||
sar edx, 4
|
||||
movd mm1, edx
|
||||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
pshufw mm1, mm1, 0x0
|
||||
mov rdi, arg(0) ;dst;
|
||||
packuswb mm1, mm1
|
||||
|
||||
; write out
|
||||
mov rdi, arg(0) ;dst;
|
||||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
lea rax, [rcx*3]
|
||||
lea rdx, [rdi+rcx*4]
|
||||
|
||||
movq [rdi ], mm1
|
||||
movq [rdi+rcx ], mm1
|
||||
movq [rdi+rcx*2], mm1
|
||||
movq [rdi+rax ], mm1
|
||||
lea rdi, [rdi+rcx*4]
|
||||
movq [rdi ], mm1
|
||||
movq [rdi+rcx ], mm1
|
||||
movq [rdi+rcx*2], mm1
|
||||
movq [rdi+rax ], mm1
|
||||
movq [rdx ], mm1
|
||||
movq [rdx+rcx ], mm1
|
||||
movq [rdx+rcx*2], mm1
|
||||
movq [rdx+rax ], mm1
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
@@ -478,7 +476,7 @@ sym(vp8_intra_pred_uv_ve_mmx):
|
||||
; int dst_stride
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride,
|
||||
; int left_stride
|
||||
; )
|
||||
%macro vp8_intra_pred_uv_ho 1
|
||||
global sym(vp8_intra_pred_uv_ho_%1)
|
||||
@@ -575,38 +573,43 @@ vp8_intra_pred_uv_ho ssse3
|
||||
;void vp8_intra_pred_y_dc_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride
|
||||
; )
|
||||
global sym(vp8_intra_pred_y_dc_sse2)
|
||||
sym(vp8_intra_pred_y_dc_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
; from top
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
sub rsi, rax
|
||||
mov rdi, arg(2) ;above
|
||||
mov rsi, arg(3) ;left
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
|
||||
pxor xmm0, xmm0
|
||||
movdqa xmm1, [rsi]
|
||||
movdqa xmm1, [rdi]
|
||||
psadbw xmm1, xmm0
|
||||
movq xmm2, xmm1
|
||||
punpckhqdq xmm1, xmm1
|
||||
paddw xmm1, xmm2
|
||||
|
||||
; from left
|
||||
dec rsi
|
||||
lea rdi, [rax*3]
|
||||
movzx ecx, byte [rsi+rax]
|
||||
|
||||
movzx ecx, byte [rsi]
|
||||
movzx edx, byte [rsi+rax]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax*2]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rdi]
|
||||
add ecx, edx
|
||||
lea rsi, [rsi+rax*4]
|
||||
|
||||
movzx edx, byte [rsi]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax]
|
||||
@@ -616,6 +619,7 @@ sym(vp8_intra_pred_y_dc_sse2):
|
||||
movzx edx, byte [rsi+rdi]
|
||||
add ecx, edx
|
||||
lea rsi, [rsi+rax*4]
|
||||
|
||||
movzx edx, byte [rsi]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax]
|
||||
@@ -625,6 +629,7 @@ sym(vp8_intra_pred_y_dc_sse2):
|
||||
movzx edx, byte [rsi+rdi]
|
||||
add ecx, edx
|
||||
lea rsi, [rsi+rax*4]
|
||||
|
||||
movzx edx, byte [rsi]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax]
|
||||
@@ -633,8 +638,6 @@ sym(vp8_intra_pred_y_dc_sse2):
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rdi]
|
||||
add ecx, edx
|
||||
movzx edx, byte [rsi+rax*4]
|
||||
add ecx, edx
|
||||
|
||||
; add up
|
||||
pextrw edx, xmm1, 0x0
|
||||
@@ -676,22 +679,23 @@ sym(vp8_intra_pred_y_dc_sse2):
|
||||
;void vp8_intra_pred_y_dctop_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride
|
||||
; )
|
||||
global sym(vp8_intra_pred_y_dctop_sse2)
|
||||
sym(vp8_intra_pred_y_dctop_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
;arg(3), arg(4) not used
|
||||
|
||||
; from top
|
||||
mov rcx, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
sub rcx, rax
|
||||
mov rcx, arg(2) ;above;
|
||||
pxor xmm0, xmm0
|
||||
movdqa xmm1, [rcx]
|
||||
psadbw xmm1, xmm0
|
||||
@@ -737,22 +741,25 @@ sym(vp8_intra_pred_y_dctop_sse2):
|
||||
;void vp8_intra_pred_y_dcleft_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride
|
||||
; )
|
||||
global sym(vp8_intra_pred_y_dcleft_sse2)
|
||||
sym(vp8_intra_pred_y_dcleft_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
;arg(2) not used
|
||||
|
||||
; from left
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
dec rsi
|
||||
mov rsi, arg(3) ;left;
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
|
||||
lea rdi, [rax*3]
|
||||
movzx ecx, byte [rsi]
|
||||
movzx edx, byte [rsi+rax]
|
||||
@@ -827,18 +834,21 @@ sym(vp8_intra_pred_y_dcleft_sse2):
|
||||
;void vp8_intra_pred_y_dc128_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride
|
||||
; )
|
||||
global sym(vp8_intra_pred_y_dc128_sse2)
|
||||
sym(vp8_intra_pred_y_dc128_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
;arg(2), arg(3), arg(4) not used
|
||||
|
||||
; write out
|
||||
mov rsi, 2
|
||||
movdqa xmm1, [GLOBAL(dc_128)]
|
||||
@@ -870,15 +880,16 @@ sym(vp8_intra_pred_y_dc128_sse2):
|
||||
;void vp8_intra_pred_y_tm_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride
|
||||
; )
|
||||
%macro vp8_intra_pred_y_tm 1
|
||||
global sym(vp8_intra_pred_y_tm_%1)
|
||||
sym(vp8_intra_pred_y_tm_%1):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
GET_GOT rbx
|
||||
@@ -886,9 +897,8 @@ sym(vp8_intra_pred_y_tm_%1):
|
||||
|
||||
; read top row
|
||||
mov edx, 8
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
sub rsi, rax
|
||||
mov rsi, arg(2) ;above
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
pxor xmm0, xmm0
|
||||
%ifidn %1, ssse3
|
||||
movdqa xmm3, [GLOBAL(dc_1024)]
|
||||
@@ -900,7 +910,7 @@ sym(vp8_intra_pred_y_tm_%1):
|
||||
|
||||
; set up left ptrs and subtract topleft
|
||||
movd xmm4, [rsi-1]
|
||||
lea rsi, [rsi+rax-1]
|
||||
mov rsi, arg(3) ;left
|
||||
%ifidn %1, sse2
|
||||
punpcklbw xmm4, xmm0
|
||||
pshuflw xmm4, xmm4, 0x0
|
||||
@@ -958,27 +968,29 @@ vp8_intra_pred_y_tm ssse3
|
||||
;void vp8_intra_pred_y_ve_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride
|
||||
; )
|
||||
global sym(vp8_intra_pred_y_ve_sse2)
|
||||
sym(vp8_intra_pred_y_ve_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
; end prolog
|
||||
|
||||
;arg(3), arg(4) not used
|
||||
|
||||
mov rax, arg(2) ;above;
|
||||
mov rsi, 2
|
||||
movsxd rdx, dword ptr arg(1) ;dst_stride
|
||||
|
||||
; read from top
|
||||
mov rax, arg(2) ;src;
|
||||
movsxd rdx, dword ptr arg(3) ;src_stride;
|
||||
sub rax, rdx
|
||||
movdqa xmm1, [rax]
|
||||
|
||||
; write out
|
||||
mov rsi, 2
|
||||
mov rax, arg(0) ;dst;
|
||||
movsxd rdx, dword ptr arg(1) ;dst_stride
|
||||
lea rcx, [rdx*3]
|
||||
|
||||
.label
|
||||
@@ -1004,25 +1016,27 @@ sym(vp8_intra_pred_y_ve_sse2):
|
||||
;void vp8_intra_pred_y_ho_sse2(
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *above,
|
||||
; unsigned char *left,
|
||||
; int left_stride
|
||||
; )
|
||||
global sym(vp8_intra_pred_y_ho_sse2)
|
||||
sym(vp8_intra_pred_y_ho_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
;arg(2) not used
|
||||
|
||||
; read from left and write out
|
||||
mov edx, 8
|
||||
mov rsi, arg(2) ;src;
|
||||
movsxd rax, dword ptr arg(3) ;src_stride;
|
||||
mov rsi, arg(3) ;left;
|
||||
movsxd rax, dword ptr arg(4) ;left_stride;
|
||||
mov rdi, arg(0) ;dst;
|
||||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
dec rsi
|
||||
|
||||
vp8_intra_pred_y_ho_sse2_loop:
|
||||
movd xmm0, [rsi]
|
||||
|
||||
@@ -110,23 +110,32 @@ void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
|
||||
vp8_intra_pred_uv_ho_ssse3);
|
||||
}
|
||||
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dctop_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dcleft_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc128_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ho_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ve_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_sse2);
|
||||
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_ssse3);
|
||||
#define build_intra_predictors_mby_prototype(sym) \
|
||||
void sym(unsigned char *dst, int dst_stride, \
|
||||
const unsigned char *above, \
|
||||
const unsigned char *left, int left_stride)
|
||||
typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t));
|
||||
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2);
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2);
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2);
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2);
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2);
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2);
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2);
|
||||
extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3);
|
||||
|
||||
static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
|
||||
unsigned char * yabove_row,
|
||||
unsigned char *dst_y,
|
||||
int dst_stride,
|
||||
build_intra_predictors_mbuv_fn_t tm_func)
|
||||
unsigned char * yleft,
|
||||
int left_stride,
|
||||
build_intra_predictors_mby_fn_t tm_func)
|
||||
{
|
||||
int mode = x->mode_info_context->mbmi.mode;
|
||||
build_intra_predictors_mbuv_fn_t fn;
|
||||
int src_stride = x->dst.y_stride;
|
||||
|
||||
switch (mode) {
|
||||
case V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
|
||||
case H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
|
||||
@@ -147,19 +156,31 @@ static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
|
||||
default: return;
|
||||
}
|
||||
|
||||
// fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
|
||||
fn(dst_y, dst_stride, yabove_row, yleft, left_stride);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
|
||||
void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x,
|
||||
unsigned char * yabove_row,
|
||||
unsigned char * yleft,
|
||||
int left_stride,
|
||||
unsigned char * ypred_ptr,
|
||||
int y_stride)
|
||||
{
|
||||
vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
|
||||
vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
|
||||
y_stride, yleft, left_stride,
|
||||
vp8_intra_pred_y_tm_sse2);
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x)
|
||||
void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x,
|
||||
unsigned char * yabove_row,
|
||||
unsigned char * yleft,
|
||||
int left_stride,
|
||||
unsigned char * ypred_ptr,
|
||||
int y_stride)
|
||||
{
|
||||
vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
|
||||
vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
|
||||
y_stride, yleft, left_stride,
|
||||
vp8_intra_pred_y_tm_ssse3);
|
||||
|
||||
}
|
||||
|
||||
@@ -815,15 +815,15 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
|
||||
/* Allocate memory for above_row buffers. */
|
||||
CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
|
||||
for (i=0; i< pc->mb_rows; i++)
|
||||
CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_calloc(sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1)), 1));
|
||||
CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
|
||||
|
||||
CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
|
||||
for (i=0; i< pc->mb_rows; i++)
|
||||
CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
|
||||
CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
|
||||
|
||||
CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
|
||||
for (i=0; i< pc->mb_rows; i++)
|
||||
CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
|
||||
CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
|
||||
|
||||
/* Allocate memory for left_col buffers. */
|
||||
CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
|
||||
|
||||
Reference in New Issue
Block a user