diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index b956e6af9..d71def519 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -369,7 +369,6 @@ specialize vp9_sad8x16 mmx sse2
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
 
-# TODO(jingning): need to covert these functions into mmx/sse2 form
 prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad8x4 sse2
 
@@ -379,6 +378,45 @@ specialize vp9_sad4x8 sse
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad4x4 mmx sse
 
+prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x64_avg sse2
+
+prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x64_avg sse2
+
+prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x32_avg sse2
+
+prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x16_avg sse2
+
+prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x32_avg sse2
+
+prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x32_avg sse2
+
+prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x16_avg sse2
+
+prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x8_avg sse2
+
+prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x16_avg sse2
+
+prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x8_avg sse2
+
+prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x4_avg sse2
+
+prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x8_avg sse
+
+prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x4_avg sse
+
 prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar16x16_h sse2
 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
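The thirteen new `_avg` prototypes take the plain SAD arguments plus a `second_pred` block. As a reading aid, the sketch below shows the intended semantics: form the rounded average of the reference block and `second_pred`, then take the SAD against the source. The helper name is hypothetical and not part of the patch; the C fallback added in vp9_sad_c.c further down does the same thing via `comp_avg_pred`.

```c
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical reference model of what a vp9_sadMxN_avg function computes.
 * second_pred is assumed to be a contiguous block with stride m, matching
 * the prototypes above. */
static unsigned int sad_avg_ref(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const uint8_t *second_pred, int m, int n) {
  unsigned int sad = 0;
  int i, j;
  for (i = 0; i < n; i++) {
    for (j = 0; j < m; j++) {
      /* Rounded average of reference and second prediction, then SAD. */
      const int avg = (ref_ptr[j] + second_pred[j] + 1) >> 1;
      sad += abs(src_ptr[j] - avg);
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += m;
  }
  return sad;
}
```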
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 53b70adc4..212dce3b8 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2353,16 +2353,12 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
   int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  /* Compound pred buffer */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-
   fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   /* Get compound pred by averaging two pred blocks. */
-  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
-
-  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+  bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
+                         second_pred, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
 
   for (i = 0; i < search_range; i++) {
@@ -2380,9 +2376,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
             best_address;
 
         /* Get compound block and use it to calculate SAD. */
-        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
-                      in_what_stride);
-        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+        thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
+                               second_pred, bestsad);
 
         if (thissad < bestsad) {
           this_mv.as_mv.row = this_row_offset;
@@ -2412,10 +2407,11 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
   this_mv.as_mv.col = ref_mv->as_mv.col << 3;
 
   if (bestsad < INT_MAX) {
-    int besterr;
-    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
-    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
-                         (unsigned int *)(&thissad)) +
+    // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
+    // so we don't have to use the subpixel with xoff=0,yoff=0 here.
+    int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0,
+                               what, what_stride, (unsigned int *)(&thissad),
+                               second_pred) +
         mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
                     xd->allow_high_precision_mv);
     return besterr;
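vp9_refining_search_8p_c previously built the compound prediction into a scratch buffer with `comp_avg_pred` and then ran the plain `sdf` on it; the fused `sdaf` call is expected to return the same SAD without the intermediate store. A hedged consistency-check sketch follows. The function name is made up; it assumes vp9/encoder/vp9_variance.h and vpx_ports/mem.h are included, and that `w`/`h` match the block size behind `fn_ptr`.

```c
/* Hypothetical sanity check (not part of the patch): the fused sdaf call
 * should match the old comp_avg_pred + sdf path bit-exactly. */
static int sdaf_matches_old_path(const vp9_variance_fn_ptr_t *fn_ptr,
                                 const uint8_t *what, int what_stride,
                                 const uint8_t *check_here, int in_what_stride,
                                 const uint8_t *second_pred, int w, int h) {
  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
  unsigned int old_sad, new_sad;

  /* Old path: explicit compound prediction, then plain SAD. */
  comp_avg_pred(comp_pred, second_pred, w, h, check_here, in_what_stride);
  old_sad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff);

  /* New path: averaging folded into the SAD call. */
  new_sad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
                         second_pred, 0x7fffffff);

  return old_sad == new_sad;
}
```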
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 48a8b48c6..d6fa3fa6c 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1472,8 +1472,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
 
-#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
+            SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
+    cpi->fn_ptr[BT].sdaf           = SDAF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
     cpi->fn_ptr[BT].svaf           = SVAF; \
@@ -1484,67 +1486,80 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
-  BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
+  BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+      vp9_variance32x16, vp9_sub_pixel_variance32x16,
       vp9_sub_pixel_avg_variance32x16, NULL, NULL, NULL, NULL, NULL,
       vp9_sad32x16x4d)
 
-  BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
+  BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+      vp9_variance16x32, vp9_sub_pixel_variance16x32,
       vp9_sub_pixel_avg_variance16x32, NULL, NULL, NULL, NULL, NULL,
       vp9_sad16x32x4d)
 
-  BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
+  BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+      vp9_variance64x32, vp9_sub_pixel_variance64x32,
       vp9_sub_pixel_avg_variance64x32, NULL, NULL, NULL, NULL, NULL,
       vp9_sad64x32x4d)
 
-  BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
+  BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+      vp9_variance32x64, vp9_sub_pixel_variance32x64,
       vp9_sub_pixel_avg_variance32x64, NULL, NULL, NULL, NULL, NULL,
       vp9_sad32x64x4d)
 
-  BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+  BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+      vp9_variance32x32, vp9_sub_pixel_variance32x32,
       vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
       vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
-  BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
+  BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+      vp9_variance64x64, vp9_sub_pixel_variance64x64,
       vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
       vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
-  BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+  BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+      vp9_variance16x16, vp9_sub_pixel_variance16x16,
      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
      vp9_variance_halfpixvar16x16_v,
      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
      vp9_sad16x16x4d)
 
-  BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+  BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+      vp9_variance16x8, vp9_sub_pixel_variance16x8,
      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
-  BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+  BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+      vp9_variance8x16, vp9_sub_pixel_variance8x16,
      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
-  BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+  BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+      vp9_variance8x8, vp9_sub_pixel_variance8x8,
      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
-  BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+  BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+      vp9_variance8x4, vp9_sub_pixel_variance8x4,
      vp9_sub_pixel_avg_variance8x4, NULL, NULL, NULL, NULL,
      vp9_sad8x4x8,
      vp9_sad8x4x4d)
 
-  BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+  BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+      vp9_variance4x8, vp9_sub_pixel_variance4x8,
      vp9_sub_pixel_avg_variance4x8, NULL, NULL, NULL, NULL,
      vp9_sad4x8x8,
      vp9_sad4x8x4d)
 
-  BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+  BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+      vp9_variance4x4, vp9_sub_pixel_variance4x4,
      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
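The `BFP` table now takes the averaging-SAD pointer as its third argument and stores it in the new `sdaf` slot. For orientation, one invocation roughly boils down to the assignments below (illustrative only and abridged to the SAD/variance slots; the real code relies on the macro above):

```c
#include "./vp9_rtcd.h"
#include "vp9/encoder/vp9_variance.h"

/* Sketch of what BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg, ...)
 * expands to, trimmed to the most relevant fields. */
static void fill_16x16_fn_ptr(vp9_variance_fn_ptr_t *p) {
  p->sdf    = vp9_sad16x16;
  p->sdaf   = vp9_sad16x16_avg;   /* new averaging-SAD slot */
  p->vf     = vp9_variance16x16;
  p->svf    = vp9_sub_pixel_variance16x16;
  p->svaf   = vp9_sub_pixel_avg_variance16x16;
  p->sdx3f  = vp9_sad16x16x3;
  p->sdx8f  = vp9_sad16x16x8;
  p->sdx4df = vp9_sad16x16x4d;
}
```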
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index 6b1ba4964..42ddb21a5 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -11,25 +11,43 @@
 #include <stdlib.h>
 
 #include "vp9/common/vp9_sadmxn.h"
+#include "vp9/encoder/vp9_variance.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "./vp9_rtcd.h"
 
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
-                            int src_stride,
-                            const uint8_t *ref_ptr,
-                            int ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
+#define sad_mxn_func(m, n) \
+unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride, \
+                                  unsigned int max_sad) { \
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+} \
+unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int ref_stride, \
+                                      const uint8_t *second_pred, \
+                                      unsigned int max_sad) { \
+  uint8_t comp_pred[m * n]; \
+  comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+  return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
 }
 
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
-                            int src_stride,
-                            const uint8_t *ref_ptr,
-                            int ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
-}
+sad_mxn_func(64, 64)
+sad_mxn_func(64, 32)
+sad_mxn_func(32, 64)
+sad_mxn_func(32, 32)
+sad_mxn_func(32, 16)
+sad_mxn_func(16, 32)
+sad_mxn_func(16, 16)
+sad_mxn_func(16, 8)
+sad_mxn_func(8, 16)
+sad_mxn_func(8, 8)
+sad_mxn_func(8, 4)
+sad_mxn_func(4, 8)
+sad_mxn_func(4, 4)
 
 void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
                        int src_stride,
@@ -46,14 +64,6 @@ void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
                        ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
-                            int src_stride,
-                            const uint8_t *ref_ptr,
-                            int ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
-}
-
 void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
                        int src_stride,
                        const uint8_t* const ref_ptr[],
@@ -69,22 +79,6 @@ void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
                        ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
-                            int src_stride,
-                            const uint8_t *ref_ptr,
-                            int ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
-                            int src_stride,
-                            const uint8_t *ref_ptr,
-                            int ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
-}
-
 void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
                        int src_stride,
                        const uint8_t* const ref_ptr[],
@@ -100,14 +94,6 @@ void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
                        ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
-                            int src_stride,
-                            const uint8_t *ref_ptr,
-                            int ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
-}
-
 void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
                        int src_stride,
                        const uint8_t* const ref_ptr[],
@@ -123,63 +109,6 @@ void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
                        ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
-                            int src_stride,
-                            const uint8_t *ref_ptr,
-                            int ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
-                          int src_stride,
-                          const uint8_t *ref_ptr,
-                          int ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
-                           int src_stride,
-                           const uint8_t *ref_ptr,
-                           int ref_stride,
-                           unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
-                           int src_stride,
-                           const uint8_t *ref_ptr,
-                           int ref_stride,
-                           unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
-                          int src_stride,
-                          const uint8_t *ref_ptr,
-                          int ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
-}
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
-                          int src_stride,
-                          const uint8_t *ref_ptr,
-                          int ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
-}
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
-                          int src_stride,
-                          const uint8_t *ref_ptr,
-                          int ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
 void vp9_sad64x64x3_c(const uint8_t *src_ptr,
                       int src_stride,
                       const uint8_t *ref_ptr,
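Each `sad_mxn_func(m, n)` instantiation emits both the plain and the `_avg` C versions, replacing the hand-written per-size functions removed above. For reference, `sad_mxn_func(8, 4)` expands (reformatted, with a comment added) to:

```c
unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
}
unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              const uint8_t *second_pred,
                              unsigned int max_sad) {
  uint8_t comp_pred[8 * 4];
  /* Average ref_ptr with second_pred into a stride-8 scratch block, then
   * take the plain SAD of src against that compound prediction. */
  comp_avg_pred(comp_pred, second_pred, 8, 4, ref_ptr, ref_stride);
  return sad_mx_n_c(src_ptr, src_stride, comp_pred, 8, 8, 4);
}
```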
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 38808d7be..6e686d6f9 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -20,6 +20,13 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int ref_stride,
                                     unsigned int max_sad);
 
+typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
+                                        int source_stride,
+                                        const uint8_t *ref_ptr,
+                                        int ref_stride,
+                                        const uint8_t *second_pred,
+                                        unsigned int max_sad);
+
 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
@@ -74,20 +81,21 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
                                                    int ref_stride);
 
 typedef struct vp9_variance_vtable {
-  vp9_sad_fn_t            sdf;
-  vp9_variance_fn_t       vf;
-  vp9_subpixvariance_fn_t svf;
-  vp9_subp_avg_variance_fn_t svaf;
-  vp9_variance_fn_t       svf_halfpix_h;
-  vp9_variance_fn_t       svf_halfpix_v;
-  vp9_variance_fn_t       svf_halfpix_hv;
-  vp9_sad_multi_fn_t      sdx3f;
-  vp9_sad_multi1_fn_t     sdx8f;
-  vp9_sad_multi_d_fn_t    sdx4df;
+  vp9_sad_fn_t               sdf;
+  vp9_sad_avg_fn_t           sdaf;
+  vp9_variance_fn_t          vf;
+  vp9_subpixvariance_fn_t    svf;
+  vp9_subp_avg_variance_fn_t svaf;
+  vp9_variance_fn_t          svf_halfpix_h;
+  vp9_variance_fn_t          svf_halfpix_v;
+  vp9_variance_fn_t          svf_halfpix_hv;
+  vp9_sad_multi_fn_t         sdx3f;
+  vp9_sad_multi1_fn_t        sdx8f;
+  vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
 static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, uint8_t *ref, int ref_stride) {
+                          int height, const uint8_t *ref, int ref_stride) {
   int i, j;
 
   for (i = 0; i < height; i++) {
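The hunk above only shows `comp_avg_pred` gaining a `const` reference pointer plus the first lines of its body. For context, here is a sketch of how the remainder plausibly looks (an assumption, not shown in this patch): `pred` is a contiguous width-stride block, `ref` uses `ref_stride`, and each output pixel is the rounded average, matching what `pavgb` computes in the SSE2 code below. `<stdint.h>` is assumed for `uint8_t`.

```c
static void comp_avg_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                 int width, int height,
                                 const uint8_t *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++)
      comp_pred[j] = (pred[j] + ref[j] + 1) >> 1;  /* round to nearest */
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
```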
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm
index 8fb7d4118..c4c5c54f0 100644
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -12,12 +12,42 @@
 SECTION .text
 
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD64XN 1
-cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+                                    second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+                                              ref, ref_stride, \
+                                              second_pred, \
+                                              src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea          src_stride3q, [src_strideq*3]
+  lea          ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+  SAD_FN 64, %1, 5, %2
   mov              n_rowsd, %1
   pxor                  m0, m0
 
 .loop:
@@ -25,6 +55,13 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movu                  m1, [refq]
   movu                  m2, [refq+16]
   movu                  m3, [refq+32]
   movu                  m4, [refq+48]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
   psadbw                m1, [srcq]
   psadbw                m2, [srcq+16]
   psadbw                m3, [srcq+32]
@@ -47,21 +84,27 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
 INIT_XMM sse2
 SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
 
 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-%macro SAD32XN 1
-cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
+%macro SAD32XN 1-2 0
+  SAD_FN 32, %1, 5, %2
   mov              n_rowsd, %1/2
   pxor                  m0, m0
-
 .loop:
   movu                  m1, [refq]
   movu                  m2, [refq+16]
   movu                  m3, [refq+ref_strideq]
   movu                  m4, [refq+ref_strideq+16]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
   psadbw                m1, [srcq]
   psadbw                m2, [srcq+16]
   psadbw                m3, [srcq+src_strideq]
@@ -85,16 +128,14 @@ INIT_XMM sse2
 SAD32XN 64 ; sad32x64_sse2
 SAD32XN 32 ; sad32x32_sse2
 SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
 
 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
-%macro SAD16XN 1
-cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
-                           src_stride3, ref_stride3, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  lea          src_stride3q, [src_strideq*3]
-  lea          ref_stride3q, [ref_strideq*3]
+%macro SAD16XN 1-2 0
+  SAD_FN 16, %1, 7, %2
   mov              n_rowsd, %1/4
   pxor                  m0, m0
 
@@ -103,6 +144,13 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
   movu                  m1, [refq]
   movu                  m2, [refq+ref_strideq]
   movu                  m3, [refq+ref_strideq*2]
   movu                  m4, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
   psadbw                m1, [srcq]
   psadbw                m2, [srcq+src_strideq]
   psadbw                m3, [srcq+src_strideq*2]
@@ -126,16 +174,14 @@ INIT_XMM sse2
 SAD16XN 32 ; sad16x32_sse2
 SAD16XN 16 ; sad16x16_sse2
 SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
 
 ; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
-%macro SAD8XN 1
-cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
-                          src_stride3, ref_stride3, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  lea          src_stride3q, [src_strideq*3]
-  lea          ref_stride3q, [ref_strideq*3]
+%macro SAD8XN 1-2 0
+  SAD_FN 8, %1, 7, %2
   mov              n_rowsd, %1/4
   pxor                  m0, m0
 
@@ -144,6 +190,11 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
   movh                  m1, [refq]
   movhps                m1, [refq+ref_strideq]
   movh                  m2, [refq+ref_strideq*2]
   movhps                m2, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  lea         second_predq, [second_predq+mmsize*2]
+%endif
   movh                  m3, [srcq]
   movhps                m3, [srcq+src_strideq]
   movh                  m4, [srcq+src_strideq*2]
@@ -167,16 +218,14 @@ INIT_XMM sse2
 SAD8XN 16 ; sad8x16_sse2
 SAD8XN 8 ; sad8x8_sse2
 SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
 
 ; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
 ;                                  uint8_t *ref, int ref_stride);
-%macro SAD4XN 1
-cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
-                          src_stride3, ref_stride3, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  lea          src_stride3q, [src_strideq*3]
-  lea          ref_stride3q, [ref_strideq*3]
+%macro SAD4XN 1-2 0
+  SAD_FN 4, %1, 7, %2
   mov              n_rowsd, %1/4
   pxor                  m0, m0
 
@@ -187,6 +236,11 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
   movd                  m4, [refq+ref_stride3q]
   punpckldq             m1, m2
   punpckldq             m3, m4
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m3, [second_predq+mmsize*1]
+  lea         second_predq, [second_predq+mmsize*2]
+%endif
   movd                  m2, [srcq]
   movd                  m5, [srcq+src_strideq]
   movd                  m4, [srcq+src_strideq*2]
@@ -209,3 +263,5 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
 INIT_MMX sse
 SAD4XN 8 ; sad4x8_sse
 SAD4XN 4 ; sad4x4_sse
+SAD4XN 8, 1 ; sad4x8_avg_sse
+SAD4XN 4, 1 ; sad4x4_avg_sse
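The `%if %2 == 1` blocks implement the averaging variant with one extra `pavgb` per reference load before the `psadbw`, advancing `second_predq` by the bytes consumed. As an illustration only (not part of the patch), the same idea for a single 16-byte row written with SSE2 intrinsics:

```c
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Average ref with second_pred via PAVGB, then accumulate PSADBW partial
 * sums against src into acc (two 64-bit lanes per 16-byte row). */
static __m128i sad16_avg_row(__m128i acc, const uint8_t *src,
                             const uint8_t *ref, const uint8_t *second_pred) {
  __m128i r = _mm_loadu_si128((const __m128i *)ref);
  const __m128i p = _mm_loadu_si128((const __m128i *)second_pred);
  const __m128i s = _mm_loadu_si128((const __m128i *)src);
  r = _mm_avg_epu8(r, p);                          /* pavgb: (a + b + 1) >> 1 */
  return _mm_add_epi64(acc, _mm_sad_epu8(r, s));   /* psadbw + running sum */
}
```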