Add averaging-SAD functions for 8-point comp-inter motion search.
Makes the first 50 frames of bus @ 1500kbps encode in 3min18.2 instead of 3min22.7, i.e. 2.3% faster. In addition, use the sub_pixel_avg functions to calculate the variance of the averaging predictor. This is slightly suboptimal because the function is subpixel-position-aware, but it will (at least for the SSE2 version) not actually use a bilinear filter for a full-pixel position, so performance is approximately the same as if we implemented an actual average-aware full-pixel variance function. That gains another 0.3 seconds (encode time goes to 3min17.4), for a total gain of 2.7%.

Change-Id: I3f059d2b04243921868cfed2568d4fa65d7b5acd
This commit is contained in:
parent 9d95993115
commit c24d922396
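For readers skimming the diff, the operation being added is simple: an averaging SAD compares the source block against the rounded average of the reference block and a second (compound) predictor. The sketch below is purely illustrative and is not the in-tree API; it assumes 8-bit pixels and a second predictor stored contiguously with a stride equal to the block width. The actual implementations are the sad_mxn_func macro in the C file and the SAD_FN/pavgb paths in the SSE2 assembly further down.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: rounded average of ref and second_pred, then SAD vs src. */
static unsigned int avg_sad_sketch(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   const uint8_t *second_pred,
                                   int width, int height) {
  unsigned int sad = 0;
  int i, j;
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      /* (a + b + 1) >> 1 matches the rounding of the SSE2 pavgb instruction */
      const int avg = (ref[j] + second_pred[j] + 1) >> 1;
      sad += abs(src[j] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;  /* second-predictor rows are packed back to back */
  }
  return sad;
}

The C versions in the diff build the averaged block with comp_avg_pred() and reuse sad_mx_n_c(), while the SSE2 versions fold the average into the SAD loop with pavgb, so no intermediate buffer is needed.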
@@ -369,7 +369,6 @@ specialize vp9_sad8x16 mmx sse2
prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x8 mmx sse2

# TODO(jingning): need to covert these functions into mmx/sse2 form
prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x4 sse2

@@ -379,6 +378,45 @@ specialize vp9_sad4x8 sse
prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad4x4 mmx sse

prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad64x64_avg sse2

prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad32x64_avg sse2

prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad64x32_avg sse2

prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad32x16_avg sse2

prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad16x32_avg sse2

prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad32x32_avg sse2

prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad16x16_avg sse2

prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad16x8_avg sse2

prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad8x16_avg sse2

prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad8x8_avg sse2

prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad8x4_avg sse2

prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad4x8_avg sse

prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad4x4_avg sse

prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
@@ -2353,16 +2353,12 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
  int *mvjsadcost = x->nmvjointsadcost;
  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};

  /* Compound pred buffer */
  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);

  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;

  /* Get compound pred by averaging two pred blocks. */
  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);

  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
  bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
                         second_pred, 0x7fffffff) +
      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);

  for (i = 0; i < search_range; i++) {
@@ -2380,9 +2376,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
          best_address;

      /* Get compound block and use it to calculate SAD. */
      comp_avg_pred(comp_pred, second_pred, w, h, check_here,
                    in_what_stride);
      thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
      thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
                             second_pred, bestsad);

      if (thissad < bestsad) {
        this_mv.as_mv.row = this_row_offset;
@@ -2412,10 +2407,11 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
  this_mv.as_mv.col = ref_mv->as_mv.col << 3;

  if (bestsad < INT_MAX) {
    int besterr;
    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
                         (unsigned int *)(&thissad)) +
    // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
    // so we don't have to use the subpixel with xoff=0,yoff=0 here.
    int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0,
                               what, what_stride, (unsigned int *)(&thissad),
                               second_pred) +
        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
                    xd->allow_high_precision_mv);
    return besterr;
@@ -1472,8 +1472,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
  for (i = 0; i < MAX_MODES; i++)
    cpi->rd_thresh_mult[i] = 128;

#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
            SDX3F, SDX8F, SDX4DF)\
  cpi->fn_ptr[BT].sdf = SDF; \
  cpi->fn_ptr[BT].sdaf = SDAF; \
  cpi->fn_ptr[BT].vf = VF; \
  cpi->fn_ptr[BT].svf = SVF; \
  cpi->fn_ptr[BT].svaf = SVAF; \
@@ -1484,67 +1486,80 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
  cpi->fn_ptr[BT].sdx8f = SDX8F; \
  cpi->fn_ptr[BT].sdx4df = SDX4DF;

  BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
  BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
      vp9_variance32x16, vp9_sub_pixel_variance32x16,
      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
      NULL, NULL, NULL,
      vp9_sad32x16x4d)

  BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
  BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
      vp9_variance16x32, vp9_sub_pixel_variance16x32,
      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
      NULL, NULL, NULL,
      vp9_sad16x32x4d)

  BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
  BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
      vp9_variance64x32, vp9_sub_pixel_variance64x32,
      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
      NULL, NULL, NULL,
      vp9_sad64x32x4d)

  BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
  BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
      vp9_variance32x64, vp9_sub_pixel_variance32x64,
      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
      NULL, NULL, NULL,
      vp9_sad32x64x4d)

  BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
  BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
      vp9_variance32x32, vp9_sub_pixel_variance32x32,
      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
      vp9_variance_halfpixvar32x32_v,
      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
      vp9_sad32x32x4d)

  BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
  BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
      vp9_variance64x64, vp9_sub_pixel_variance64x64,
      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
      vp9_variance_halfpixvar64x64_v,
      vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
      vp9_sad64x64x4d)

  BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
  BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
      vp9_variance16x16, vp9_sub_pixel_variance16x16,
      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
      vp9_variance_halfpixvar16x16_v,
      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
      vp9_sad16x16x4d)

  BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
  BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
      vp9_variance16x8, vp9_sub_pixel_variance16x8,
      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)

  BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
  BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
      vp9_variance8x16, vp9_sub_pixel_variance8x16,
      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)

  BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
  BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
      vp9_variance8x8, vp9_sub_pixel_variance8x8,
      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)

  BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
  BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
      vp9_variance8x4, vp9_sub_pixel_variance8x4,
      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
      NULL, NULL, vp9_sad8x4x8,
      vp9_sad8x4x4d)

  BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
  BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
      vp9_variance4x8, vp9_sub_pixel_variance4x8,
      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
      NULL, NULL, vp9_sad4x8x8,
      vp9_sad4x8x4d)

  BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
  BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
      vp9_variance4x4, vp9_sub_pixel_variance4x4,
      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)

@@ -11,25 +11,43 @@

#include <stdlib.h>
#include "vp9/common/vp9_sadmxn.h"
#include "vp9/encoder/vp9_variance.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "./vp9_rtcd.h"

unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
                            int src_stride,
                            const uint8_t *ref_ptr,
                            int ref_stride,
                            unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
#define sad_mxn_func(m, n) \
unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
                                  int src_stride, \
                                  const uint8_t *ref_ptr, \
                                  int ref_stride, \
                                  unsigned int max_sad) { \
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
} \
unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
                                      int src_stride, \
                                      const uint8_t *ref_ptr, \
                                      int ref_stride, \
                                      const uint8_t *second_pred, \
                                      unsigned int max_sad) { \
  uint8_t comp_pred[m * n]; \
  comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
  return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
}

unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
                            int src_stride,
                            const uint8_t *ref_ptr,
                            int ref_stride,
                            unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
}
sad_mxn_func(64, 64)
sad_mxn_func(64, 32)
sad_mxn_func(32, 64)
sad_mxn_func(32, 32)
sad_mxn_func(32, 16)
sad_mxn_func(16, 32)
sad_mxn_func(16, 16)
sad_mxn_func(16, 8)
sad_mxn_func(8, 16)
sad_mxn_func(8, 8)
sad_mxn_func(8, 4)
sad_mxn_func(4, 8)
sad_mxn_func(4, 4)

void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
                       int src_stride,
@@ -46,14 +64,6 @@ void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
                       ref_ptr[3], ref_stride, 0x7fffffff);
}

unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
                            int src_stride,
                            const uint8_t *ref_ptr,
                            int ref_stride,
                            unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
}

void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
                       int src_stride,
                       const uint8_t* const ref_ptr[],
@@ -69,22 +79,6 @@ void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
                       ref_ptr[3], ref_stride, 0x7fffffff);
}

unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
                            int src_stride,
                            const uint8_t *ref_ptr,
                            int ref_stride,
                            unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
}

unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
                            int src_stride,
                            const uint8_t *ref_ptr,
                            int ref_stride,
                            unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
}

void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
                       int src_stride,
                       const uint8_t* const ref_ptr[],
@@ -100,14 +94,6 @@ void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
                       ref_ptr[3], ref_stride, 0x7fffffff);
}

unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
                            int src_stride,
                            const uint8_t *ref_ptr,
                            int ref_stride,
                            unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
}

void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
                       int src_stride,
                       const uint8_t* const ref_ptr[],
@@ -123,63 +109,6 @@ void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
                       ref_ptr[3], ref_stride, 0x7fffffff);
}

unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
                            int src_stride,
                            const uint8_t *ref_ptr,
                            int ref_stride,
                            unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
}

unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
                          int src_stride,
                          const uint8_t *ref_ptr,
                          int ref_stride,
                          unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
}


unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
                           int src_stride,
                           const uint8_t *ref_ptr,
                           int ref_stride,
                           unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
}

unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
                           int src_stride,
                           const uint8_t *ref_ptr,
                           int ref_stride,
                           unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
}

unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
                          int src_stride,
                          const uint8_t *ref_ptr,
                          int ref_stride,
                          unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
}

unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
                          int src_stride,
                          const uint8_t *ref_ptr,
                          int ref_stride,
                          unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
}

unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
                          int src_stride,
                          const uint8_t *ref_ptr,
                          int ref_stride,
                          unsigned int max_sad) {
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
}

void vp9_sad64x64x3_c(const uint8_t *src_ptr,
                      int src_stride,
                      const uint8_t *ref_ptr,
@@ -20,6 +20,13 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                    int ref_stride,
                                    unsigned int max_sad);

typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
                                        int source_stride,
                                        const uint8_t *ref_ptr,
                                        int ref_stride,
                                        const uint8_t *second_pred,
                                        unsigned int max_sad);

typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                   int source_stride,
                                   const uint8_t *ref_ptr,
@@ -74,20 +81,21 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
                                                    int ref_stride);

typedef struct vp9_variance_vtable {
  vp9_sad_fn_t sdf;
  vp9_variance_fn_t vf;
  vp9_subpixvariance_fn_t svf;
  vp9_subp_avg_variance_fn_t svaf;
  vp9_variance_fn_t svf_halfpix_h;
  vp9_variance_fn_t svf_halfpix_v;
  vp9_variance_fn_t svf_halfpix_hv;
  vp9_sad_multi_fn_t sdx3f;
  vp9_sad_multi1_fn_t sdx8f;
  vp9_sad_multi_d_fn_t sdx4df;
  vp9_sad_fn_t sdf;
  vp9_sad_avg_fn_t sdaf;
  vp9_variance_fn_t vf;
  vp9_subpixvariance_fn_t svf;
  vp9_subp_avg_variance_fn_t svaf;
  vp9_variance_fn_t svf_halfpix_h;
  vp9_variance_fn_t svf_halfpix_v;
  vp9_variance_fn_t svf_halfpix_hv;
  vp9_sad_multi_fn_t sdx3f;
  vp9_sad_multi1_fn_t sdx8f;
  vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;

static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, uint8_t *ref, int ref_stride) {
                          int height, const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; i++) {
@@ -12,12 +12,42 @@

SECTION .text

; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1
cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea src_stride3q, [src_strideq*3]
  lea ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov n_rowsd, %1
  pxor m0, m0
.loop:
@@ -25,6 +55,13 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
  movu m2, [refq+16]
  movu m3, [refq+32]
  movu m4, [refq+48]
%if %2 == 1
  pavgb m1, [second_predq+mmsize*0]
  pavgb m2, [second_predq+mmsize*1]
  pavgb m3, [second_predq+mmsize*2]
  pavgb m4, [second_predq+mmsize*3]
  lea second_predq, [second_predq+mmsize*4]
%endif
  psadbw m1, [srcq]
  psadbw m2, [srcq+16]
  psadbw m3, [srcq+32]
@@ -47,21 +84,27 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1
cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov n_rowsd, %1/2
  pxor m0, m0

.loop:
  movu m1, [refq]
  movu m2, [refq+16]
  movu m3, [refq+ref_strideq]
  movu m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb m1, [second_predq+mmsize*0]
  pavgb m2, [second_predq+mmsize*1]
  pavgb m3, [second_predq+mmsize*2]
  pavgb m4, [second_predq+mmsize*3]
  lea second_predq, [second_predq+mmsize*4]
%endif
  psadbw m1, [srcq]
  psadbw m2, [srcq+16]
  psadbw m3, [srcq+src_strideq]
@@ -85,16 +128,14 @@ INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1
cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
                           src_stride3, ref_stride3, n_rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  lea src_stride3q, [src_strideq*3]
  lea ref_stride3q, [ref_strideq*3]
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov n_rowsd, %1/4
  pxor m0, m0

@@ -103,6 +144,13 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
  movu m2, [refq+ref_strideq]
  movu m3, [refq+ref_strideq*2]
  movu m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb m1, [second_predq+mmsize*0]
  pavgb m2, [second_predq+mmsize*1]
  pavgb m3, [second_predq+mmsize*2]
  pavgb m4, [second_predq+mmsize*3]
  lea second_predq, [second_predq+mmsize*4]
%endif
  psadbw m1, [srcq]
  psadbw m2, [srcq+src_strideq]
  psadbw m3, [srcq+src_strideq*2]
@@ -126,16 +174,14 @@ INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1
cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
                          src_stride3, ref_stride3, n_rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  lea src_stride3q, [src_strideq*3]
  lea ref_stride3q, [ref_strideq*3]
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov n_rowsd, %1/4
  pxor m0, m0

@@ -144,6 +190,11 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
  movhps m1, [refq+ref_strideq]
  movh m2, [refq+ref_strideq*2]
  movhps m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb m1, [second_predq+mmsize*0]
  pavgb m2, [second_predq+mmsize*1]
  lea second_predq, [second_predq+mmsize*2]
%endif
  movh m3, [srcq]
  movhps m3, [srcq+src_strideq]
  movh m4, [srcq+src_strideq*2]
@@ -167,16 +218,14 @@ INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD4XN 1
cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
                          src_stride3, ref_stride3, n_rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  lea src_stride3q, [src_strideq*3]
  lea ref_stride3q, [ref_strideq*3]
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov n_rowsd, %1/4
  pxor m0, m0

@@ -187,6 +236,11 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
  movd m4, [refq+ref_stride3q]
  punpckldq m1, m2
  punpckldq m3, m4
%if %2 == 1
  pavgb m1, [second_predq+mmsize*0]
  pavgb m3, [second_predq+mmsize*1]
  lea second_predq, [second_predq+mmsize*2]
%endif
  movd m2, [srcq]
  movd m5, [srcq+src_strideq]
  movd m4, [srcq+src_strideq*2]
@@ -209,3 +263,5 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse