Remove one set of 16x16 variance functions

Calls to this set of functions are replaced by var16x16.

Change-Id: I5ff1effc6c1358ea06cda1517b88ec28ef551b0d
This commit is contained in:
Yaowu Xu 2011-06-08 13:45:29 -07:00
parent af49c11250
commit 361717d2be
13 changed files with 9 additions and 382 deletions

View File

@ -53,8 +53,7 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_armv6;
/*cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
/*cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;*/
@ -101,7 +100,6 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_neon;
/*cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_neon;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_neon;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;

View File

@ -10,7 +10,6 @@
EXPORT |vp8_mse16x16_neon|
EXPORT |vp8_get16x16pred_error_neon|
EXPORT |vp8_get4x4sse_cs_neon|
ARM
@ -76,62 +75,6 @@ mse16x16_neon_loop
ENDP
;============================
; vp8_get16x16pred_error_neon
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr, accumulating the sum of
; the byte differences and the sum of their squares, then returns
; (sum of squares) - ((sum * sum) >> 8) in r0.
; NOTE(review): the final shift and subtract operate on 32-bit lanes and only
; lane 0 is returned, so the result looks exact only while sum * sum fits in a
; signed 32-bit value - confirm against callers' expected input range.
; r0 unsigned char *src_ptr
; r1 int src_stride
; r2 unsigned char *ref_ptr
; r3 int ref_stride
|vp8_get16x16pred_error_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - pred_error
vmov.i8 q10, #0
; r12 counts loop iterations: 8 passes, each consuming two rows from each buffer.
mov r12, #8
get16x16pred_error_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
; Widening subtract: src - ref for all 32 bytes, as 16-bit lanes in q11-q14.
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
; For each difference vector: pairwise-add into the 32-bit sum lanes (q8) and
; multiply-accumulate the squares into the 32-bit lanes of q9 / q10.
vpadal.s16 q8, q11
vmlal.s16 q9, d22, d22
vmlal.s16 q10, d23, d23
; Decrement the loop counter; the bne at the bottom of the loop branches on it.
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne get16x16pred_error_neon_loop
; Horizontal reduction: merge the two square accumulators, then fold the
; 32-bit lanes pairwise into 64-bit lanes and add the halves together.
vadd.u32 q10, q9, q10
vpaddl.s32 q0, q8
vpaddl.u32 q1, q10
; d0 = total sum of differences, d1 = total sum of squared differences.
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
; sum * sum, shifted right by 8 (divide by the 256 pixels), subtracted from
; the sum of squares.
vmull.s32 q5, d0, d0
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
; Return lane 0 of the result in r0.
vmov.32 r0, d0[0]
bx lr
ENDP
;=============================
; r0 unsigned char *src_ptr,

View File

@ -83,7 +83,6 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
//extern prototype_getmbss(vp8_get_mb_ss_c);
extern prototype_variance(vp8_mse16x16_neon);
extern prototype_get16x16prederror(vp8_get16x16pred_error_neon);
extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
@ -147,9 +146,6 @@ extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_neon
#undef vp8_variance_get16x16prederror
#define vp8_variance_get16x16prederror vp8_get16x16pred_error_neon
#undef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
#endif

View File

@ -67,7 +67,6 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;

View File

@ -43,7 +43,6 @@ extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
extern unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
extern int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *, int *, int *, int, int *mvcost[2], int, int fullpixel);
extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
@ -98,37 +97,6 @@ static int get_inter_mbpred_error(MACROBLOCK *mb,
}
/*
 * Computes the prediction error of a 16x16 block:
 *
 *     sum((src - ref)^2) - (sum(src - ref))^2 / 256
 *
 * src_ptr / ref_ptr point at the top-left pixel of each 16x16 block, and
 * src_stride / ref_stride are the distances (in bytes) between the starts of
 * consecutive rows in the respective buffers.
 *
 * Returns the error as an unsigned int.
 */
unsigned int vp8_get16x16pred_error_c
(
    const unsigned char *src_ptr,
    int src_stride,
    const unsigned char *ref_ptr,
    int ref_stride
)
{
    unsigned int pred_error = 0;
    unsigned int abs_sum;
    int sum = 0;
    int i, j;

    for (i = 0; i < 16; i++)
    {
        for (j = 0; j < 16; j++)
        {
            const int diff = src_ptr[j] - ref_ptr[j];

            sum += diff;
            pred_error += (unsigned int)(diff * diff);
        }

        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }

    /*
     * |sum| can reach 16 * 16 * 255 = 65280, whose square (4261478400) does
     * not fit in a signed 32-bit int.  Squaring the magnitude in unsigned
     * arithmetic keeps the value exact and avoids signed-overflow undefined
     * behaviour; the square is non-negative, so the result is unchanged for
     * every input that did not overflow before.
     */
    abs_sum = (sum < 0) ? (unsigned int)(-sum) : (unsigned int)sum;
    pred_error -= (abs_sum * abs_sum) / 256;

    return pred_error;
}
unsigned int vp8_get4x4sse_cs_c
(
@ -669,9 +637,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
{
rate2 += rate;
distortion2 = VARIANCE_INVOKE
(&cpi->rtcd.variance, get16x16prederror)(
(&cpi->rtcd.variance, var16x16)(
x->src.y_buffer, x->src.y_stride,
x->e_mbd.predictor, 16);
x->e_mbd.predictor, 16, &sse);
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
if (this_rd < best_intra_rd)
@ -694,7 +662,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
case TM_PRED:
RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
(&x->e_mbd);
distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);
distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
(x->src.y_buffer, x->src.y_stride,
x->e_mbd.predictor, 16, &sse);
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@ -960,6 +930,7 @@ void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
int rate, best_rate = 0, distortion, best_distortion;
MB_PREDICTION_MODE mode, best_mode = DC_PRED;
int this_rd;
unsigned int sse;
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
@ -970,8 +941,8 @@ void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
x->e_mbd.mode_info_context->mbmi.mode = mode;
RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
(&x->e_mbd);
distortion = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)
(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);
distortion = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, &sse);
rate = x->mbmode_cost[x->e_mbd.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

View File

@ -48,7 +48,6 @@ void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, i
void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
// c imports
@ -86,7 +85,6 @@ extern sub_pixel_variance_function sub_pixel_variance16x8_c;
extern sub_pixel_variance_function sub_pixel_variance16x16_c;
extern unsigned int vp8_get_mb_ss_c(short *);
extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
// ppc
@ -145,7 +143,6 @@ void vp8_cmachine_specific_config(void)
vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
vp8_get_mb_ss = vp8_get_mb_ss_c;
vp8_get16x16pred_error = vp8_get16x16pred_error_c;
vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
vp8_sad16x16 = vp8_sad16x16_ppc;

View File

@ -308,11 +308,6 @@ extern prototype_getmbss(vp8_variance_getmbss);
#endif
extern prototype_variance(vp8_variance_mse16x16);
#ifndef vp8_variance_get16x16prederror
#define vp8_variance_get16x16prederror vp8_get16x16pred_error_c
#endif
extern prototype_get16x16prederror(vp8_variance_get16x16prederror);
#ifndef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_c
#endif
@ -366,7 +361,6 @@ typedef struct
vp8_getmbss_fn_t getmbss;
vp8_variance_fn_t mse16x16;
vp8_get16x16prederror_fn_t get16x16prederror;
vp8_get16x16prederror_fn_t get4x4sse_cs;
vp8_sad_multi_fn_t sad16x16x3;

View File

@ -843,136 +843,6 @@ filter_block2d_bil_var_mmx_loop:
pop rbp
ret
;unsigned int vp8_get16x16pred_error_mmx
;(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride
;)
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr, accumulating the sum of
; the byte differences (Sum) and the sum of their squares (SSE), and returns
; SSE - ((Sum * Sum) >> 8) in rax.
global sym(vp8_get16x16pred_error_mmx)
sym(vp8_get16x16pred_error_mmx):
push rbp
mov rbp, rsp
; SHADOW_ARGS_TO_STACK / GET_GOT are macros defined outside this view.
SHADOW_ARGS_TO_STACK 4
GET_GOT rbx
push rsi
push rdi
; Reserve 16 bytes of stack scratch space for the Sum / SSE spill below.
sub rsp, 16
; end prolog
mov rsi, arg(0) ;DWORD PTR [src_ptr]
mov rdi, arg(2) ;DWORD PTR [ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[src_stride]
movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
pxor mm0, mm0 ; clear mm0 for unpack
pxor mm7, mm7 ; clear mm7 for accumulating diffs
pxor mm6, mm6 ; clear mm6 for accumulating sse
; rcx = row counter (16 rows).
mov rcx, 16
var16loop:
; First 8 bytes of the row: unpack bytes to words against zero, subtract,
; add the word differences into mm7 and the pairwise squared sums into mm6.
movq mm1, [rsi]
movq mm2, [rdi]
movq mm3, mm1
movq mm4, mm2
punpcklbw mm1, mm0
punpckhbw mm3, mm0
punpcklbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm2
psubw mm3, mm4
paddw mm7, mm1
pmaddwd mm1, mm1
paddw mm7, mm3
pmaddwd mm3, mm3
paddd mm6, mm1
paddd mm6, mm3
; Second 8 bytes of the row: same sequence at offset 8.
movq mm1, [rsi+8]
movq mm2, [rdi+8]
movq mm3, mm1
movq mm4, mm2
punpcklbw mm1, mm0
punpckhbw mm3, mm0
punpcklbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm2
psubw mm3, mm4
paddw mm7, mm1
pmaddwd mm1, mm1
paddw mm7, mm3
pmaddwd mm3, mm3
paddd mm6, mm1
paddd mm6, mm3
; Advance both pointers by their strides and loop until all rows are done.
add rsi, rax
add rdi, rdx
sub rcx, 1
jnz var16loop
; Reduction. mm1 keeps the two dword SSE partials; the word differences in
; mm7 are placed in the upper half of each dword and arithmetically shifted
; right by 16, which sign-extends them to dwords before they are summed.
movq mm1, mm6
pxor mm6, mm6
pxor mm5, mm5
punpcklwd mm6, mm7
punpckhwd mm5, mm7
psrad mm5, 16
psrad mm6, 16
paddd mm6, mm5
; Fold the high dword onto the low dword for both SSE (mm2) and Sum (mm6).
movq mm2, mm1
psrlq mm1, 32
paddd mm2, mm1
movq mm7, mm6
psrlq mm6, 32
paddd mm6, mm7
movd DWORD PTR [rsp], mm6 ;Sum
movd DWORD PTR [rsp+4], mm2 ;SSE
; return (SSE-((Sum*Sum)>>8));
movsxd rdx, dword ptr [rsp]
imul rdx, rdx
sar rdx, 8
movsxd rax, dword ptr [rsp + 4]
sub rax, rdx
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};

View File

@ -213,122 +213,6 @@ var16loop:
ret
;unsigned int vp8_get16x16pred_error_sse2
;(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride
;)
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr, accumulating the sum of
; the byte differences (Sum) and the sum of their squares (SSE), and returns
; SSE - ((Sum * Sum) >> 8) in rax.
global sym(vp8_get16x16pred_error_sse2)
sym(vp8_get16x16pred_error_sse2):
push rbp
mov rbp, rsp
; SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT are macros defined outside this view.
SHADOW_ARGS_TO_STACK 4
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; Reserve 16 bytes of stack scratch space for the Sum / SSE spill below.
sub rsp, 16
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[src_stride]
movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
; rcx = row counter (16 rows).
mov rcx, 16
var16peloop:
; One full 16-byte row per iteration (unaligned loads): unpack bytes to words
; against zero, subtract, add the word differences into xmm7 and the pairwise
; squared sums into xmm6.
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklbw xmm1, xmm0
punpckhbw xmm3, xmm0
punpcklbw xmm2, xmm0
punpckhbw xmm4, xmm0
psubw xmm1, xmm2
psubw xmm3, xmm4
paddw xmm7, xmm1
pmaddwd xmm1, xmm1
paddw xmm7, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm1
paddd xmm6, xmm3
; Advance both pointers by their strides and loop until all rows are done.
add rsi, rax
add rdi, rdx
sub rcx, 1
jnz var16peloop
; Reduction. xmm1 keeps the four dword SSE partials; the word differences in
; xmm7 are placed in the upper half of each dword and arithmetically shifted
; right by 16, which sign-extends them to dwords before they are summed.
movdqa xmm1, xmm6
pxor xmm6, xmm6
pxor xmm5, xmm5
punpcklwd xmm6, xmm7
punpckhwd xmm5, xmm7
psrad xmm5, 16
psrad xmm6, 16
paddd xmm6, xmm5
; Interleave the dword partials with zero (xmm0) and add low/high halves,
; then fold the upper 8 bytes onto the lower 8 bytes so the low dword of
; xmm7 holds Sum and the low dword of xmm1 holds SSE.
movdqa xmm2, xmm1
punpckldq xmm1, xmm0
punpckhdq xmm2, xmm0
movdqa xmm7, xmm6
paddd xmm1, xmm2
punpckldq xmm6, xmm0
punpckhdq xmm7, xmm0
paddd xmm6, xmm7
movdqa xmm2, xmm1
movdqa xmm7, xmm6
psrldq xmm1, 8
psrldq xmm6, 8
paddd xmm7, xmm6
paddd xmm1, xmm2
movd DWORD PTR [rsp], xmm7 ;Sum
movd DWORD PTR [rsp+4], xmm1 ;SSE
; return (SSE-((Sum*Sum)>>8));
movsxd rdx, dword ptr [rsp]
imul rdx, rdx
sar rdx, 8
movsxd rax, dword ptr [rsp + 4]
sub rax, rdx
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get8x8var_sse2

View File

@ -76,13 +76,6 @@ extern void vp8_filter_block2d_bil_var_mmx
int *sum,
unsigned int *sumsquared
);
extern unsigned int vp8_get16x16pred_error_mmx
(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride
);
unsigned int vp8_variance4x4_mmx(

View File

@ -53,13 +53,6 @@ unsigned int vp8_get16x16var_sse2
unsigned int *SSE,
int *Sum
);
unsigned int vp8_get16x16pred_error_sse2
(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride
);
unsigned int vp8_get8x8var_sse2
(
const unsigned char *src_ptr,

View File

@ -41,7 +41,6 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
extern prototype_getmbss(vp8_get_mb_ss_mmx);
extern prototype_variance(vp8_mse16x16_mmx);
extern prototype_get16x16prederror(vp8_get16x16pred_error_mmx);
extern prototype_variance2(vp8_get8x8var_mmx);
extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);
@ -109,9 +108,6 @@ extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_mmx
#undef vp8_variance_get16x16prederror
#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
#undef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
@ -141,7 +137,6 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
extern prototype_getmbss(vp8_get_mb_ss_sse2);
extern prototype_variance(vp8_mse16x16_wmt);
extern prototype_get16x16prederror(vp8_get16x16pred_error_sse2);
extern prototype_variance2(vp8_get8x8var_sse2);
extern prototype_variance2(vp8_get16x16var_sse2);
@ -209,9 +204,6 @@ extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_wmt
#undef vp8_variance_get16x16prederror
#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
#endif
#endif

View File

@ -175,7 +175,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;
cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
@ -224,8 +223,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;
cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;