Added sse2 acceleration for highbitdepth variance

Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f
(cherry picked from commit d7422b2b1eb9f0011a8c379c2be680d6892b16bc)
(cherry picked from commit 6d741e4d76a7d9ece69ca117d1d9e2f9ee48ef8c)
This commit is contained in:
Peter de Rivaz 2014-10-16 14:00:54 +01:00 committed by Deb Mukherjee
parent 807885b5e0
commit 48032bfcdb
7 changed files with 3289 additions and 300 deletions

File diff suppressed because it is too large Load Diff

@ -112,6 +112,9 @@ typedef struct {
// Common for both INTER and INTRA blocks
BLOCK_SIZE sb_type;
PREDICTION_MODE mode;
#if CONFIG_FILTERINTRA
int filterbit, uv_filterbit;
#endif
TX_SIZE tx_size;
int8_t skip;
int8_t segment_id;
@ -126,11 +129,18 @@ typedef struct {
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
uint8_t mode_context[MAX_REF_FRAMES];
INTERP_FILTER interp_filter;
#if CONFIG_EXT_TX
EXT_TX_TYPE ext_txfrm;
#endif
} MB_MODE_INFO;
typedef struct MODE_INFO {
struct MODE_INFO *src_mi;
MB_MODE_INFO mbmi;
#if CONFIG_FILTERINTRA
int b_filter_info[4];
#endif
b_mode_info bmi[4];
} MODE_INFO;
@ -139,6 +149,17 @@ static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
: mi->mbmi.mode;
}
#if CONFIG_FILTERINTRA
static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
(void)mode;
return 1;
}
static INLINE int is_filter_enabled(TX_SIZE txsize) {
return (txsize < TX_SIZES);
}
#endif
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[0] > INTRA_FRAME;
}
@ -236,12 +257,33 @@ static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
#if CONFIG_EXT_TX
static TX_TYPE ext_tx_to_txtype(EXT_TX_TYPE ext_tx) {
switch (ext_tx) {
case NORM:
default:
return DCT_DCT;
case ALT:
return ADST_ADST;
}
}
#endif
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
#if CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || xd->lossless)
return DCT_DCT;
if (is_inter_block(mbmi)) {
return ext_tx_to_txtype(mbmi->ext_txfrm);
}
#else
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
return DCT_DCT;
#endif
return intra_mode_to_tx_type_lookup[mbmi->mode];
}
@ -249,8 +291,17 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
const MODE_INFO *const mi = xd->mi[0].src_mi;
#if CONFIG_EXT_TX
if (plane_type != PLANE_TYPE_Y || xd->lossless)
return DCT_DCT;
if (is_inter_block(&mi->mbmi)) {
return ext_tx_to_txtype(mi->mbmi.ext_txfrm);
}
#else
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
return DCT_DCT;
#endif
return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
}

@ -1283,34 +1283,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x16/;
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x32/;
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x32/;
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x64/;
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x32/;
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x64/;
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x16/;
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x8/;
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x16/;
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x8/;
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
@ -1322,40 +1322,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get8x8var/;
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get16x16var/;
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x16/;
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x32/;
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x32/;
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x64/;
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x32/;
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x64/;
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x16/;
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x8/;
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x16/;
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x8/;
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
@ -1367,40 +1367,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get8x8var/;
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get16x16var/;
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x16/;
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x32/;
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x32/;
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x64/;
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x32/;
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x64/;
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x16/;
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x8/;
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x16/;
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x8/;
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
@ -1412,76 +1412,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get8x8var/;
specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get16x16var/;
specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@ -1496,70 +1496,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@ -1574,70 +1574,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@ -1817,7 +1817,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/;
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@ -1826,10 +1826,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x8/;
specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x16/;
specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@ -1838,10 +1838,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x8/;
specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x16/;
specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@ -1850,7 +1850,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x8/;
specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE

File diff suppressed because it is too large Load Diff

@ -0,0 +1,313 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp9_highbd_calc16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
prefetcht0 [rsi]
prefetcht0 [rsi+16]
prefetcht0 [rsi+rax]
prefetcht0 [rsi+rax+16]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
prefetcht0 [rdi]
prefetcht0 [rdi+16]
prefetcht0 [rdi+rdx]
prefetcht0 [rdi+rdx+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm5, xmm5
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+16]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+16]
paddd xmm6, xmm1
psubw xmm3, xmm2
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm3
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax+16]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
paddd xmm6, xmm1
psubw xmm3, xmm2
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1
punpckhwd xmm2, xmm1
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
sub rcx, 2
jnz .var16loop
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp9_highbd_calc8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
prefetcht0 [rsi]
prefetcht0 [rsi+rax]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
prefetcht0 [rdi]
prefetcht0 [rdi+rdx]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 8
.var8loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
lea rbx, [rsi+rax*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rbx+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rdi+rdx*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
lea rbx, [rbx+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm5, xmm5
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
psubw xmm3, xmm2
movdqu xmm1, XMMWORD PTR [rsi]
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi]
paddd xmm6, xmm3
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1
psubw xmm3, xmm2
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1
punpckhwd xmm2, xmm1
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
sub rcx, 4
jnz .var8loop
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

@ -0,0 +1,580 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
}
#define HIGH_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
#define VAR_FN(w, h, block_size, shift) \
uint32_t vp9_highbd_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, x_offset, y_offset, \
dst + 16, dst_stride, sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, x_offset, y_offset, \
dst + 32, dst_stride, sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN

@ -104,6 +104,7 @@ VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
endif
ifeq ($(CONFIG_USE_X86INC),yes)
@ -115,6 +116,8 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif