From 75b33c68c75f3d35e7b409aff87f12b307aa096b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 9 Jul 2013 14:54:20 -0700 Subject: [PATCH] SSE/SSE2 assembly for 4x4/8x8/16x16/32x32 V intra prediction. Change-Id: I55a6cfa2daba738cbc0c4a02f806893f7e556997 --- vp9/common/vp9_rtcd_defs.sh | 8 ++-- vp9/common/x86/vp9_intrapred_sse2.asm | 63 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index c5e108722..313cfd6b5 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -74,7 +74,7 @@ prototype void vp9_d153_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, u specialize vp9_d153_predictor_4x4 prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_4x4 +specialize vp9_v_predictor_4x4 sse prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_tm_predictor_4x4 @@ -113,7 +113,7 @@ prototype void vp9_d153_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, u specialize vp9_d153_predictor_8x8 prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_8x8 +specialize vp9_v_predictor_8x8 sse prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_tm_predictor_8x8 @@ -152,7 +152,7 @@ prototype void vp9_d153_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, specialize vp9_d153_predictor_16x16 prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_16x16 +specialize vp9_v_predictor_16x16 sse2 prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_tm_predictor_16x16 @@ -191,7 +191,7 @@ prototype void vp9_d153_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, specialize vp9_d153_predictor_32x32 prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_32x32 +specialize vp9_v_predictor_32x32 sse2 prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_tm_predictor_32x32 diff --git a/vp9/common/x86/vp9_intrapred_sse2.asm b/vp9/common/x86/vp9_intrapred_sse2.asm index 550df2457..5aa3dba25 100644 --- a/vp9/common/x86/vp9_intrapred_sse2.asm +++ b/vp9/common/x86/vp9_intrapred_sse2.asm @@ -125,3 +125,66 @@ cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left dec lines4d jnz .loop REP_RET + +INIT_MMX sse +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_MMX sse +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET