Hadamard transform based coding mode decision process

This commit uses a Hadamard transform based rate-distortion cost
estimate for RTC coding mode decision. It improves the compression
performance of speed -6 on many hard clips at lower bit rates, e.g.,
5.5% for jimredvga, 6.7% for mmmoving, and 6.1% for niklas720p. It
introduces extra encoding cycle cost at this point.

Change-Id: Iaf70634fa2417a705ee29f2456175b981db3d375
Author: Jingning Han
Date:   2015-03-23 10:02:42 -07:00
Commit: 8c411f74e0 (parent ba13ff8501)
4 changed files with 400 additions and 22 deletions
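
The idea in brief: instead of running a full forward DCT for every candidate
mode, the mode decision loop applies a Hadamard transform (adds and subtracts
only) to the residual and uses the sum of absolute transformed differences
(SATD) as a cheap rate proxy. The following is a minimal, self-contained
sketch of that cost estimate, not the library code below; the 4x4 size and
the helper names are illustrative only.

#include <stdio.h>
#include <stdlib.h>

// Illustrative 1-D 4-point Hadamard butterfly.
static void hadamard4(const int in[4], int out[4]) {
  const int b0 = in[0] + in[1];
  const int b1 = in[0] - in[1];
  const int b2 = in[2] + in[3];
  const int b3 = in[2] - in[3];
  out[0] = b0 + b2;
  out[1] = b1 + b3;
  out[2] = b0 - b2;
  out[3] = b1 - b3;
}

// 2-D transform (rows, then columns); SATD = sum of |coefficients|.
static int satd4x4(const int diff[4][4]) {
  int tmp[4][4], v[4], w[4], i, j, satd = 0;
  for (i = 0; i < 4; ++i) hadamard4(diff[i], tmp[i]);
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) v[i] = tmp[i][j];
    hadamard4(v, w);
    for (i = 0; i < 4; ++i) satd += abs(w[i]);
  }
  return satd;
}

int main(void) {
  const int diff[4][4] = { { 3, -1,  0,  2 },
                           { 0,  4, -2,  1 },
                           { 1,  0,  0, -3 },
                           { 2, -2,  1,  0 } };
  printf("SATD rate proxy: %d\n", satd4x4(diff));
  return 0;
}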

@@ -1109,6 +1109,15 @@ specialize qw/vp9_avg_8x8 sse2 neon/;
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_avg_4x4 sse2/;

add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
specialize qw/vp9_hadamard_8x8 sse2/;

add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff";
specialize qw/vp9_hadamard_16x16/;

add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
specialize qw/vp9_satd sse2/;

add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2/;
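
These add_proto/specialize entries feed libvpx's run-time CPU dispatch: for
each prototype the build generates a function pointer that is set to the C
fallback and then overridden by the SSE2 variant when the CPU supports it. A
hedged sketch of that generated pattern follows; the flag value and pointer
name are illustrative stand-ins, the real code lives in the generated
vp9_rtcd.h.

#include <stdint.h>

// Variants introduced by this commit.
void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff);

// Illustrative stand-ins for the generated dispatch machinery.
typedef void (*vp9_hadamard_8x8_fn)(int16_t const *, int, int16_t *);
static vp9_hadamard_8x8_fn vp9_hadamard_8x8_ptr;
#define HAS_SSE2 0x10  // illustrative flag value, not the real constant

static void setup_rtcd(int cpu_flags) {
  vp9_hadamard_8x8_ptr = vp9_hadamard_8x8_c;       // portable fallback
  if (cpu_flags & HAS_SSE2)
    vp9_hadamard_8x8_ptr = vp9_hadamard_8x8_sse2;  // SSE2 specialization
}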

@@ -28,6 +28,87 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
  return (sum + 8) >> 4;
}

// 1-D 8-point Hadamard transform over one column; src_stride is the
// distance between successive input elements. The output permutation
// matches the order produced by the SSE2 version.
static void hadamard_col8(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

  int16_t c0 = b0 + b2;
  int16_t c1 = b1 + b3;
  int16_t c2 = b0 - b2;
  int16_t c3 = b1 - b3;
  int16_t c4 = b4 + b6;
  int16_t c5 = b5 + b7;
  int16_t c6 = b4 - b6;
  int16_t c7 = b5 - b7;

  coeff[0] = c0 + c4;
  coeff[7] = c1 + c5;
  coeff[3] = c2 + c6;
  coeff[4] = c3 + c7;
  coeff[2] = c0 - c4;
  coeff[6] = c1 - c5;
  coeff[1] = c2 - c6;
  coeff[5] = c3 - c7;
}

// 2-D 8x8 Hadamard: transform the 8 columns of the residual block, then
// the 8 rows of the intermediate result.
void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff) {
  int idx;
  int16_t buffer[64];
  int16_t *tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(src_diff, src_stride, tmp_buf);  // column transform
    tmp_buf += 8;
    ++src_diff;
  }

  tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(tmp_buf, 8, coeff);  // row transform
    coeff += 8;
    ++tmp_buf;
  }
}

// In place 16x16 2D Hadamard transform. Expects the four 8x8 Hadamard
// outputs of the block's quadrants stored consecutively in coeff (64
// coefficients each); applies the remaining cross-quadrant butterfly,
// halving the results to preserve 16-bit dynamic range.
void vp9_hadamard_16x16_c(int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 64; ++idx) {
    int16_t a0 = coeff[0];
    int16_t a1 = coeff[64];
    int16_t a2 = coeff[128];
    int16_t a3 = coeff[192];

    int16_t b0 = a0 + a1;
    int16_t b1 = a0 - a1;
    int16_t b2 = a2 + a3;
    int16_t b3 = a2 - a3;

    coeff[0] = (b0 + b2) >> 1;
    coeff[64] = (b1 + b3) >> 1;
    coeff[128] = (b0 - b2) >> 1;
    coeff[192] = (b1 - b3) >> 1;

    ++coeff;
  }
}
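
Reading off the indexing above: vp9_hadamard_16x16_c wants the four quadrant
transforms packed back to back in coeff before it combines them in place. A
sketch of how a caller might assemble the full 16x16 transform; the raster
quadrant order and the helper itself are assumptions for illustration, not
code from this commit.

#include <stdint.h>

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_16x16_c(int16_t *coeff);

// Hypothetical helper: full 16x16 Hadamard of a residual block built from
// four 8x8 transforms plus the in-place combine above.
static void hadamard_16x16_full(const int16_t *src_diff, int src_stride,
                                int16_t coeff[256]) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // Quadrants in raster order; each result occupies 64 coefficients.
    const int16_t *src =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 1) * 8;
    vp9_hadamard_8x8_c(src, src_stride, coeff + idx * 64);
  }
  vp9_hadamard_16x16_c(coeff);
}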

// Sum of absolute transformed differences. Note the sum is accumulated in
// an int and truncated to int16_t on return.
int16_t vp9_satd_c(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i)
    satd += abs(coeff[i]);

  return (int16_t)satd;
}

// Integer projection onto row vectors.
void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
                       const int ref_stride, const int height) {

@@ -20,9 +20,11 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"

@@ -188,6 +190,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 cond_cost_list(cpi, cost_list),
                                 x->nmvjointcost, x->mvcost,
                                 &dis, &x->pred_sse[ref], NULL, 0, 0);

    *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
  }

  if (scaled_ref_frame) {

@@ -198,7 +202,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
  return rv;
}

static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                              MACROBLOCK *x, MACROBLOCKD *xd,
                              int *out_rate_sum, int64_t *out_dist_sum,

@@ -312,6 +315,105 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
  *out_dist_sum += dist << 4;
}

static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                      int *skippable, int64_t *sse, int plane,
                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  const struct macroblockd_plane *pd = &xd->plane[plane];
  const struct macroblock_plane *const p = &x->plane[plane];
  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
  const int step = 1 << (tx_size << 1);
  const int block_step = (1 << tx_size);
  int block = 0, r, c;
  int shift = tx_size == TX_32X32 ? 0 : 2;
  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));

#if CONFIG_VP9_HIGHBITDEPTH
  // The transform/quantization path below assumes 8-bit coefficients; fall
  // back to the model-based estimate for high bit-depth builds.
  unsigned int var_y, sse_y;
  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
  *sse = INT_MAX;
  *skippable = 0;
  return;
#else
  (void)cpi;
#endif

  vp9_subtract_plane(x, bsize, plane);
  *skippable = 1;
  *rate = 0;
  *dist = 0;
  *sse = 0;

  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  for (r = 0; r < max_blocks_high; r += block_step) {
    for (c = 0; c < num_4x4_w; c += block_step) {
      if (c < max_blocks_wide) {
        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
        uint16_t *const eob = &p->eobs[block];
        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
        int i, j;
        const int16_t *src_diff;
        int64_t this_sse;
        txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
        src_diff = &p->src_diff[4 * (j * diff_stride + i)];

        switch (tx_size) {
          case TX_32X32:
            vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
                                  p->round_fp, p->quant_fp, p->quant_shift,
                                  qcoeff, dqcoeff, pd->dequant, eob,
                                  scan_order->scan, scan_order->iscan);
            break;
          case TX_16X16:
            vp9_fdct16x16(src_diff, coeff, diff_stride);
            vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          case TX_8X8:
            // Use the cheap Hadamard transform in place of the 8x8 DCT for
            // the rate-distortion estimate.
            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
            vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          case TX_4X4:
            x->fwd_txm4x4(src_diff, coeff, diff_stride);
            vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          default:
            assert(0);
            break;
        }

        // Distortion is measured on the dequantized coefficients; rate is
        // approximated by the sum of absolute quantized coefficients (SATD).
        *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
        *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
        *sse += (this_sse >> shift);
        *skippable &= (*eob == 0);
      }
      block += step;
    }
  }

  // Scale the coefficient-magnitude sum into the bit-cost domain used by
  // the RDCOST comparison.
  *rate <<= 8;
  *rate *= 6;
}
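
block_yrd hands its rate and distortion to the RDCOST comparison in the mode
loop further down, which decides whether coding the coefficients beats
forcing a skip. Here is that decision in isolation as a worked illustration;
the RDCOST definition is copied on the assumption that it matches the macro
VP9 used at the time (see vp9/encoder/vp9_rd.h), so treat this as a sketch.

#include <stdint.h>

// Assumed form of VP9's rate-distortion cost macro at the time of this
// commit; rate is weighted by rdmult and distortion is shifted by rddiv.
#define RDCOST(RM, DM, R, D) \
  (((128 + (R) * (RM)) >> 8) + ((D) << (DM)))

// Pick the cheaper of coding the coefficients vs. forcing a skip, mirroring
// the logic in vp9_pick_inter_mode below.
static int64_t skip_or_code(int rdmult, int rddiv, int rate, int64_t dist,
                            int64_t sse, int *force_skip) {
  const int64_t rd_coded = RDCOST(rdmult, rddiv, rate, dist);
  const int64_t rd_skip = RDCOST(rdmult, rddiv, 0, sse);
  *force_skip = rd_skip <= rd_coded;
  return *force_skip ? rd_skip : rd_coded;
}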

static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               int *out_rate_sum, int64_t *out_dist_sum,

@@ -518,7 +620,9 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
  int i, j;
  int rate;
  int64_t dist;
  unsigned int var_y, sse_y;
  int64_t this_sse;
  int is_skippable;

  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
  assert(plane == 0);
  (void) plane;

@@ -533,8 +637,16 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                          x->skip_encode ? src_stride : dst_stride,
                          pd->dst.buf, dst_stride,
                          i, j, 0);
  // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
  model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
  // TODO(jingning): This needs further refactoring.
  block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
            bsize_tx, tx_size);
  x->skip_txfm[0] = is_skippable;
  if (is_skippable)
    rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
  else
    rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);

  p->src.buf = src_buf_base;
  pd->dst.buf = dst_buf_base;
  args->rate += rate;

@@ -602,10 +714,6 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
  *rd_cost = best_rdc;
}

static const int ref_frame_cost[MAX_REF_FRAMES] = {
  1235, 229, 530, 615,
};

typedef struct {
  MV_REFERENCE_FRAME ref_frame;
  PREDICTION_MODE pred_mode;

@@ -682,6 +790,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  int ref_frame_skip_mask = 0;
  int idx;
  int best_pred_sad = INT_MAX;
  int ref_frame_cost[MAX_REF_FRAMES];
  vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
  vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
  vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);

  // Per-block reference frame costs derived from the current entropy
  // context, replacing the fixed table removed above.
  ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
  ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
      ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);

  ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);

  if (reuse_inter_pred) {
    int i;

@@ -773,6 +895,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    int mode_index;
    int i;
    PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
    int64_t this_sse;
    int is_skippable;

    if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
      continue;

@@ -935,17 +1060,40 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
              vp9_get_switchable_rate(cpi, xd) : 0;
    }

    // TODO(jingning): disable color operations temporarily.
    // chroma component rate-distortion cost modeling
    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
      int uv_rate = 0;
      int64_t uv_dist = 0;
      if (x->color_sensitivity[0])
        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
      if (x->color_sensitivity[1])
        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
      model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, &var_y, &sse_y);
      this_rdc.rate += uv_rate;
      this_rdc.dist += uv_dist;
    // if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
    //   int uv_rate = 0;
    //   int64_t uv_dist = 0;
    //   if (x->color_sensitivity[0])
    //     vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
    //   if (x->color_sensitivity[1])
    //     vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
    //   model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
    //                      &var_y, &sse_y);
    //   this_rdc.rate += uv_rate;
    //   this_rdc.dist += uv_dist;
    // }

    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);

    // Luma rate-distortion cost from the transform-domain estimate.
    block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
              &this_sse, 0, bsize, mbmi->tx_size);
    x->skip_txfm[0] = is_skippable;
    if (is_skippable) {
      // Every transform block quantized to all zeros: only the skip flag
      // needs to be coded.
      this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
    } else {
      if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
          RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
        this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
      } else {
        // Forcing skip is cheaper in rate-distortion terms than coding the
        // coefficients, so take the prediction error as the distortion.
        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
        this_rdc.dist = this_sse;
      }
    }

    if (cm->interp_filter == SWITCHABLE) {
      if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
        this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
    }

    this_rdc.rate += rate_mv;

@@ -1042,6 +1190,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    const PREDICTION_MODE this_mode = intra_mode_list[i];
    if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
      continue;

    mbmi->mode = this_mode;
    mbmi->ref_frame[0] = INTRA_FRAME;
    args.mode = this_mode;
    args.rate = 0;
    args.dist = 0;

@@ -1058,17 +1208,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    if (this_rdc.rdcost < best_rdc.rdcost) {
      best_rdc = this_rdc;
      mbmi->mode = this_mode;
      best_mode = this_mode;
      best_intra_tx_size = mbmi->tx_size;
      mbmi->ref_frame[0] = INTRA_FRAME;
      best_ref_frame = INTRA_FRAME;
      mbmi->uv_mode = this_mode;
      mbmi->mv[0].as_int = INVALID_MV;
      best_mode_skip_txfm = x->skip_txfm[0];
    }
  }

  // Reset mb_mode_info to the best inter mode.
  if (mbmi->ref_frame[0] != INTRA_FRAME) {
  x->skip_txfm[0] = best_mode_skip_txfm;
  if (best_ref_frame != INTRA_FRAME) {
    mbmi->tx_size = best_tx_size;
  } else {
    mbmi->tx_size = best_intra_tx_size;

@@ -1076,6 +1226,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  }

  pd->dst = orig_dst;
  mbmi->mode = best_mode;
  mbmi->ref_frame[0] = best_ref_frame;
  x->skip_txfm[0] = best_mode_skip_txfm;

  if (reuse_inter_pred && best_pred != NULL) {
    if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {

@@ -57,6 +57,141 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  return (avg + 8) >> 4;
}

// 8-point Hadamard over eight 16-bit lanes at once. The first pass
// (iter == 0) transforms columns and ends with a 16-bit transpose built
// from the unpack ladder, so the second pass (iter == 1) transforms rows.
// Both passes produce the same output permutation as the C hadamard_col8.
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b1 = _mm_add_epi16(a1, a5);
    b2 = _mm_add_epi16(a2, a6);
    b3 = _mm_add_epi16(a3, a7);
    b4 = _mm_sub_epi16(a0, a4);
    b5 = _mm_sub_epi16(a1, a5);
    b6 = _mm_sub_epi16(a2, a6);
    b7 = _mm_sub_epi16(a3, a7);

    // 8x8 transpose of 16-bit elements: interleave 16-, 32-, then 64-bit
    // units.
    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[7] = _mm_unpackhi_epi64(b0, b1);
    in[3] = _mm_unpacklo_epi64(b2, b3);
    in[4] = _mm_unpackhi_epi64(b2, b3);
    in[2] = _mm_unpacklo_epi64(b4, b5);
    in[6] = _mm_unpackhi_epi64(b4, b5);
    in[1] = _mm_unpacklo_epi64(b6, b7);
    in[5] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  // Aligned loads: assumes src_diff is 16-byte aligned and src_stride
  // keeps each row aligned.
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);  // columns, then transpose
  hadamard_col8_sse2(src, 1);  // rows

  _mm_storeu_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_storeu_si128((__m128i *)coeff, src[7]);
}

int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);  // abs() via the sign-mask trick
  coeff += 8;

  for (i = 8; i < length; i += 8) {
    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  // Horizontal reduction of the eight 16-bit partial sums.
  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}
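
The SSE2 paths are expected to match the C reference bit for bit. Below is a
hypothetical spot check (not part of the commit) exercising both; the static
buffers are 16-byte aligned because the SSE2 code uses _mm_load_si128, and
the alignment attribute shown is GCC/Clang syntax.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff);
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff);
int16_t vp9_satd_c(const int16_t *coeff, int length);
int16_t vp9_satd_sse2(const int16_t *coeff, int length);

int main(void) {
  static int16_t src[64] __attribute__((aligned(16)));
  static int16_t coeff_c[64] __attribute__((aligned(16)));
  static int16_t coeff_sse2[64] __attribute__((aligned(16)));
  int i, trial;

  srand(0);
  for (trial = 0; trial < 1000; ++trial) {
    // Residual values in the 8-bit pixel-difference range.
    for (i = 0; i < 64; ++i) src[i] = (int16_t)(rand() % 511 - 255);
    vp9_hadamard_8x8_c(src, 8, coeff_c);
    vp9_hadamard_8x8_sse2(src, 8, coeff_sse2);
    if (memcmp(coeff_c, coeff_sse2, sizeof(coeff_c)) != 0 ||
        vp9_satd_c(coeff_c, 64) != vp9_satd_sse2(coeff_sse2, 64)) {
      printf("mismatch on trial %d\n", trial);
      return 1;
    }
  }
  printf("C and SSE2 paths match\n");
  return 0;
}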

void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;