Merge "Force_split on 16x16 blocks in variance partition."
This commit is contained in:
@@ -1114,6 +1114,9 @@ specialize qw/vp9_avg_8x8 sse2 neon/;
|
|||||||
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
|
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
|
||||||
specialize qw/vp9_avg_4x4 sse2/;
|
specialize qw/vp9_avg_4x4 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
|
||||||
|
specialize qw/vp9_minmax_8x8 sse2/;
|
||||||
|
|
||||||
add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
|
add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
|
||||||
specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
|
specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
|
||||||
|
|
||||||
@@ -1137,6 +1140,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
|||||||
specialize qw/vp9_highbd_avg_8x8/;
|
specialize qw/vp9_highbd_avg_8x8/;
|
||||||
add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
|
add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
|
||||||
specialize qw/vp9_highbd_avg_4x4/;
|
specialize qw/vp9_highbd_avg_4x4/;
|
||||||
|
# Declared void: the C implementation returns nothing and reports through *min / *max.
add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
|
||||||
|
specialize qw/vp9_highbd_minmax_8x8/;
|
||||||
}
|
}
|
||||||
|
|
||||||
# ENCODEMB INVOKE
|
# ENCODEMB INVOKE
|
||||||
|
|||||||
@@ -155,6 +155,20 @@ int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
|
|||||||
return var;
|
return var;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Finds the smallest and largest absolute sample difference between two 8x8
// blocks of 8-bit samples.
//   s, p   : first block and its row stride (in samples).
//   d, dp  : second block and its row stride (in samples).
//   min    : receives the minimum of |s[j] - d[j]| over the 64 positions.
//   max    : receives the maximum of |s[j] - d[j]| over the 64 positions.
void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
                      int *min, int *max) {
  int i, j;
  // 255 / 0 are the extreme values an 8-bit absolute difference can take, so
  // the first sample examined always replaces them where appropriate.
  int lo = 255;
  int hi = 0;
  for (i = 0; i < 8; ++i, s += p, d += dp) {
    for (j = 0; j < 8; ++j) {
      const int diff = abs(s[j] - d[j]);
      if (diff < lo) lo = diff;
      if (diff > hi) hi = diff;
    }
  }
  *min = lo;
  *max = hi;
}
|
||||||
|
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
|
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
|
||||||
int i, j;
|
int i, j;
|
||||||
@@ -175,6 +189,22 @@ unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
|
|||||||
|
|
||||||
return (sum + 8) >> 4;
|
return (sum + 8) >> 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// High-bit-depth counterpart of the 8x8 min/max absolute-difference search.
//   s8, d8 : block pointers; each is passed through CONVERT_TO_SHORTPTR to
//            obtain the uint16_t sample pointer that is actually read.
//   p, dp  : row strides, applied to the uint16_t pointers.
//   min    : receives the minimum of |s[j] - d[j]| over the 64 positions.
//   max    : receives the maximum of |s[j] - d[j]| over the 64 positions.
void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                             int dp, int *min, int *max) {
  int i, j;
  // Declarations precede statements so the file also builds as C89.
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
  // Samples are 16-bit here, so a difference can exceed 255; start from the
  // largest value a uint16_t difference can take so the true minimum is
  // always found.
  *min = 65535;
  *max = 0;
  for (i = 0; i < 8; ++i, s += p, d += dp) {
    for (j = 0; j < 8; ++j) {
      const int diff = abs(s[j] - d[j]);
      *min = diff < *min ? diff : *min;
      *max = diff > *max ? diff : *max;
    }
  }
}
|
||||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -390,18 +390,21 @@ static int set_vt_partitioning(VP9_COMP *cpi,
|
|||||||
variance_node vt;
|
variance_node vt;
|
||||||
const int block_width = num_8x8_blocks_wide_lookup[bsize];
|
const int block_width = num_8x8_blocks_wide_lookup[bsize];
|
||||||
const int block_height = num_8x8_blocks_high_lookup[bsize];
|
const int block_height = num_8x8_blocks_high_lookup[bsize];
|
||||||
|
const int low_res = (cm->width <= 352 && cm->height <= 288);
|
||||||
|
|
||||||
assert(block_height == block_width);
|
assert(block_height == block_width);
|
||||||
tree_to_node(data, bsize, &vt);
|
tree_to_node(data, bsize, &vt);
|
||||||
|
|
||||||
if (force_split)
|
if (force_split == 1)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
|
// For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
|
||||||
// variance is below threshold, otherwise split will be selected.
|
// variance is below threshold, otherwise split will be selected.
|
||||||
// No check for vert/horiz split as too few samples for variance.
|
// No check for vert/horiz split as too few samples for variance.
|
||||||
if (bsize == bsize_min) {
|
if (bsize == bsize_min) {
|
||||||
get_variance(&vt.part_variances->none);
|
// Variance already computed to set the force_split.
|
||||||
|
if (low_res || cm->frame_type == KEY_FRAME)
|
||||||
|
get_variance(&vt.part_variances->none);
|
||||||
if (mi_col + block_width / 2 < cm->mi_cols &&
|
if (mi_col + block_width / 2 < cm->mi_cols &&
|
||||||
mi_row + block_height / 2 < cm->mi_rows &&
|
mi_row + block_height / 2 < cm->mi_rows &&
|
||||||
vt.part_variances->none.variance < threshold) {
|
vt.part_variances->none.variance < threshold) {
|
||||||
@@ -410,11 +413,10 @@ static int set_vt_partitioning(VP9_COMP *cpi,
|
|||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
} else if (bsize > bsize_min) {
|
} else if (bsize > bsize_min) {
|
||||||
// Variance is already computed for 32x32 blocks to set the force_split.
|
// Variance already computed to set the force_split.
|
||||||
if (bsize != BLOCK_32X32)
|
if (low_res || cm->frame_type == KEY_FRAME)
|
||||||
get_variance(&vt.part_variances->none);
|
get_variance(&vt.part_variances->none);
|
||||||
// For key frame or low_res: for bsize above 32X32 or very high variance,
|
// For key frame: take split for bsize above 32X32 or very high variance.
|
||||||
// take split.
|
|
||||||
if (cm->frame_type == KEY_FRAME &&
|
if (cm->frame_type == KEY_FRAME &&
|
||||||
(bsize > BLOCK_32X32 ||
|
(bsize > BLOCK_32X32 ||
|
||||||
vt.part_variances->none.variance > (threshold << 4))) {
|
vt.part_variances->none.variance > (threshold << 4))) {
|
||||||
@@ -484,21 +486,68 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
|
|||||||
thresholds[1] = threshold_base >> 2;
|
thresholds[1] = threshold_base >> 2;
|
||||||
thresholds[2] = threshold_base >> 2;
|
thresholds[2] = threshold_base >> 2;
|
||||||
thresholds[3] = threshold_base << 2;
|
thresholds[3] = threshold_base << 2;
|
||||||
|
cpi->vbp_threshold_sad = 0;
|
||||||
cpi->vbp_bsize_min = BLOCK_8X8;
|
cpi->vbp_bsize_min = BLOCK_8X8;
|
||||||
} else {
|
} else {
|
||||||
thresholds[1] = threshold_base;
|
thresholds[1] = threshold_base;
|
||||||
if (cm->width <= 352 && cm->height <= 288) {
|
if (cm->width <= 352 && cm->height <= 288) {
|
||||||
thresholds[0] = threshold_base >> 2;
|
thresholds[0] = threshold_base >> 2;
|
||||||
thresholds[2] = threshold_base << 3;
|
thresholds[2] = threshold_base << 3;
|
||||||
|
cpi->vbp_threshold_sad = 100;
|
||||||
} else {
|
} else {
|
||||||
thresholds[0] = threshold_base;
|
thresholds[0] = threshold_base;
|
||||||
|
thresholds[1] = (5 * threshold_base) >> 2;
|
||||||
thresholds[2] = threshold_base << cpi->oxcf.speed;
|
thresholds[2] = threshold_base << cpi->oxcf.speed;
|
||||||
|
cpi->vbp_threshold_sad = 1000;
|
||||||
}
|
}
|
||||||
cpi->vbp_bsize_min = BLOCK_16X16;
|
cpi->vbp_bsize_min = BLOCK_16X16;
|
||||||
}
|
}
|
||||||
|
cpi->vbp_threshold_minmax = 15 + (q >> 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute the minmax over the 8x8 subblocks.
// For each of the four 8x8 sub-blocks of the 16x16 block at
// (x16_idx, y16_idx) whose top-left corner lies inside
// pixels_wide x pixels_high, the range (max - min) of absolute differences
// between s and d is measured. The return value is the largest such range
// minus the smallest one.
//   s, sp / d, dp : the two sample buffers and their row strides.
//   highbd_flag   : (high-bit-depth builds only) buffer flags; when
//                   YV12_FLAG_HIGHBITDEPTH is set the high-bit-depth kernel
//                   is called instead of the 8-bit one.
// If no sub-block is inside the bounds, the initial values are returned
// unchanged, i.e. 0 - 255.
static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
                              int dp, int x16_idx, int y16_idx,
#if CONFIG_VP9_HIGHBITDEPTH
                              int highbd_flag,
#endif
                              int pixels_wide,
                              int pixels_high) {
  int k;
  int minmax_max = 0;
  int minmax_min = 255;
  // Loop over the 4 8x8 subblocks.
  for (k = 0; k < 4; k++) {
    // Bit 0 of k selects the column, bit 1 selects the row.
    const int x8_idx = x16_idx + ((k & 1) << 3);
    const int y8_idx = y16_idx + ((k >> 1) << 3);
    int min = 0;
    int max = 0;
    int range;
    // Skip sub-blocks that start outside the visible area.
    if (x8_idx >= pixels_wide || y8_idx >= pixels_high) continue;
#if CONFIG_VP9_HIGHBITDEPTH
    if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
      vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                            d + y8_idx * dp + x8_idx, dp,
                            &min, &max);
    } else {
      vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                     d + y8_idx * dp + x8_idx, dp,
                     &min, &max);
    }
#else
    vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                   d + y8_idx * dp + x8_idx, dp,
                   &min, &max);
#endif
    range = max - min;
    if (range > minmax_max)
      minmax_max = range;
    if (range < minmax_min)
      minmax_min = range;
  }
  return (minmax_max - minmax_min);
}
|
||||||
|
|
||||||
static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
|
static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
|
||||||
int dp, int x8_idx, int y8_idx, v8x8 *vst,
|
int dp, int x8_idx, int y8_idx, v8x8 *vst,
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
@@ -579,7 +628,7 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
|
|||||||
|
|
||||||
// This function chooses partitioning based on the variance between source and
|
// This function chooses partitioning based on the variance between source and
|
||||||
// reconstructed last, where variance is computed for down-sampled inputs.
|
// reconstructed last, where variance is computed for down-sampled inputs.
|
||||||
static void choose_partitioning(VP9_COMP *cpi,
|
static int choose_partitioning(VP9_COMP *cpi,
|
||||||
const TileInfo *const tile,
|
const TileInfo *const tile,
|
||||||
MACROBLOCK *x,
|
MACROBLOCK *x,
|
||||||
int mi_row, int mi_col) {
|
int mi_row, int mi_col) {
|
||||||
@@ -588,7 +637,7 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
int i, j, k, m;
|
int i, j, k, m;
|
||||||
v64x64 vt;
|
v64x64 vt;
|
||||||
v16x16 vt2[16];
|
v16x16 vt2[16];
|
||||||
int force_split[5];
|
int force_split[21];
|
||||||
uint8_t *s;
|
uint8_t *s;
|
||||||
const uint8_t *d;
|
const uint8_t *d;
|
||||||
int sp;
|
int sp;
|
||||||
@@ -684,6 +733,19 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
|
|
||||||
d = xd->plane[0].dst.buf;
|
d = xd->plane[0].dst.buf;
|
||||||
dp = xd->plane[0].dst.stride;
|
dp = xd->plane[0].dst.stride;
|
||||||
|
|
||||||
|
// If the y_sad is very small, take 64x64 as partition and exit.
|
||||||
|
// Don't check on boosted segment for now, as 64x64 is suppressed there.
|
||||||
|
if (segment_id == CR_SEGMENT_ID_BASE &&
|
||||||
|
y_sad < cpi->vbp_threshold_sad) {
|
||||||
|
const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
|
||||||
|
const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
|
||||||
|
if (mi_col + block_width / 2 < cm->mi_cols &&
|
||||||
|
mi_row + block_height / 2 < cm->mi_rows) {
|
||||||
|
set_block_size(cpi, xd, mi_row, mi_col, BLOCK_64X64);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
d = VP9_VAR_OFFS;
|
d = VP9_VAR_OFFS;
|
||||||
dp = 0;
|
dp = 0;
|
||||||
@@ -706,6 +768,7 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
|
// Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
|
||||||
|
// 5-20 for the 16x16 blocks.
|
||||||
force_split[0] = 0;
|
force_split[0] = 0;
|
||||||
// Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
|
// Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
|
||||||
// for splits.
|
// for splits.
|
||||||
@@ -717,7 +780,9 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
for (j = 0; j < 4; j++) {
|
for (j = 0; j < 4; j++) {
|
||||||
const int x16_idx = x32_idx + ((j & 1) << 4);
|
const int x16_idx = x32_idx + ((j & 1) << 4);
|
||||||
const int y16_idx = y32_idx + ((j >> 1) << 4);
|
const int y16_idx = y32_idx + ((j >> 1) << 4);
|
||||||
|
const int split_index = 5 + i2 + j;
|
||||||
v16x16 *vst = &vt.split[i].split[j];
|
v16x16 *vst = &vt.split[i].split[j];
|
||||||
|
force_split[split_index] = 0;
|
||||||
variance4x4downsample[i2 + j] = 0;
|
variance4x4downsample[i2 + j] = 0;
|
||||||
if (!is_key_frame) {
|
if (!is_key_frame) {
|
||||||
fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
|
fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
|
||||||
@@ -728,15 +793,36 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
pixels_high,
|
pixels_high,
|
||||||
is_key_frame);
|
is_key_frame);
|
||||||
fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
|
fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
|
||||||
// For low-resolution, compute the variance based on 8x8 down-sampling,
|
get_variance(&vt.split[i].split[j].part_variances.none);
|
||||||
// and if it is large (above the threshold) we go down for 4x4.
|
if (vt.split[i].split[j].part_variances.none.variance >
|
||||||
// For key frame we always go down to 4x4.
|
thresholds[2]) {
|
||||||
if (low_res)
|
// 16X16 variance is above threshold for split, so force split to 8x8
|
||||||
get_variance(&vt.split[i].split[j].part_variances.none);
|
// for this 16x16 block (this also forces splits for upper levels).
|
||||||
|
force_split[split_index] = 1;
|
||||||
|
force_split[i + 1] = 1;
|
||||||
|
force_split[0] = 1;
|
||||||
|
} else if (vt.split[i].split[j].part_variances.none.variance >
|
||||||
|
thresholds[1] &&
|
||||||
|
!cyclic_refresh_segment_id_boosted(segment_id)) {
|
||||||
|
// We have some nominal amount of 16x16 variance (based on average),
|
||||||
|
// compute the minmax over the 8x8 sub-blocks, and if above threshold,
|
||||||
|
// force split to 8x8 block for this 16x16 block.
|
||||||
|
int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
|
||||||
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
xd->cur_buf->flags,
|
||||||
|
#endif
|
||||||
|
pixels_wide, pixels_high);
|
||||||
|
if (minmax > cpi->vbp_threshold_minmax) {
|
||||||
|
force_split[split_index] = 1;
|
||||||
|
force_split[i + 1] = 1;
|
||||||
|
force_split[0] = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (is_key_frame || (low_res &&
|
if (is_key_frame || (low_res &&
|
||||||
vt.split[i].split[j].part_variances.none.variance >
|
vt.split[i].split[j].part_variances.none.variance >
|
||||||
(thresholds[1] << 1))) {
|
(thresholds[1] << 1))) {
|
||||||
|
force_split[split_index] = 0;
|
||||||
// Go down to 4x4 down-sampling for variance.
|
// Go down to 4x4 down-sampling for variance.
|
||||||
variance4x4downsample[i2 + j] = 1;
|
variance4x4downsample[i2 + j] = 1;
|
||||||
for (k = 0; k < 4; k++) {
|
for (k = 0; k < 4; k++) {
|
||||||
@@ -771,16 +857,20 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
fill_variance_tree(&vt.split[i], BLOCK_32X32);
|
fill_variance_tree(&vt.split[i], BLOCK_32X32);
|
||||||
// If variance of this 32x32 block is above the threshold, force the block
|
// If variance of this 32x32 block is above the threshold, force the block
|
||||||
// to split. This also forces a split on the upper (64x64) level.
|
// to split. This also forces a split on the upper (64x64) level.
|
||||||
get_variance(&vt.split[i].part_variances.none);
|
if (!force_split[i + 1]) {
|
||||||
if (vt.split[i].part_variances.none.variance > thresholds[1]) {
|
get_variance(&vt.split[i].part_variances.none);
|
||||||
force_split[i + 1] = 1;
|
if (vt.split[i].part_variances.none.variance > thresholds[1]) {
|
||||||
force_split[0] = 1;
|
force_split[i + 1] = 1;
|
||||||
|
force_split[0] = 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!force_split[0])
|
if (!force_split[0]) {
|
||||||
fill_variance_tree(&vt, BLOCK_64X64);
|
fill_variance_tree(&vt, BLOCK_64X64);
|
||||||
|
get_variance(&vt.part_variances.none);
|
||||||
|
}
|
||||||
|
|
||||||
// Now go through the entire structure, splitting every block size until
|
// Now go through the entire structure, splitting every block size until
|
||||||
// we get to one that's got a variance lower than our threshold.
|
// we get to one that's got a variance lower than our threshold.
|
||||||
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
|
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
|
||||||
!set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
|
!set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
|
||||||
@@ -805,7 +895,9 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
|
if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
|
||||||
mi_row + y32_idx + y16_idx,
|
mi_row + y32_idx + y16_idx,
|
||||||
mi_col + x32_idx + x16_idx,
|
mi_col + x32_idx + x16_idx,
|
||||||
thresholds[2], cpi->vbp_bsize_min, 0)) {
|
thresholds[2],
|
||||||
|
cpi->vbp_bsize_min,
|
||||||
|
force_split[5 + i2 + j])) {
|
||||||
for (k = 0; k < 4; ++k) {
|
for (k = 0; k < 4; ++k) {
|
||||||
const int x8_idx = (k & 1);
|
const int x8_idx = (k & 1);
|
||||||
const int y8_idx = (k >> 1);
|
const int y8_idx = (k >> 1);
|
||||||
@@ -832,6 +924,7 @@ static void choose_partitioning(VP9_COMP *cpi,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void update_state(VP9_COMP *cpi, ThreadData *td,
|
static void update_state(VP9_COMP *cpi, ThreadData *td,
|
||||||
|
|||||||
@@ -463,6 +463,8 @@ typedef struct VP9_COMP {
|
|||||||
// 0 - threshold_64x64; 1 - threshold_32x32;
|
// 0 - threshold_64x64; 1 - threshold_32x32;
|
||||||
// 2 - threshold_16x16; 3 - vbp_threshold_8x8;
|
// 2 - threshold_16x16; 3 - vbp_threshold_8x8;
|
||||||
int64_t vbp_thresholds[4];
|
int64_t vbp_thresholds[4];
|
||||||
|
int64_t vbp_threshold_minmax;
|
||||||
|
int64_t vbp_threshold_sad;
|
||||||
BLOCK_SIZE vbp_bsize_min;
|
BLOCK_SIZE vbp_bsize_min;
|
||||||
|
|
||||||
// Multi-threading
|
// Multi-threading
|
||||||
|
|||||||
@@ -11,6 +11,83 @@
|
|||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
|
// SSE2 version of the 8x8 min/max absolute-difference search on 8-bit samples.
//   s, p   : first block and its row stride (in bytes).
//   d, dp  : second block and its row stride (in bytes).
//   min    : receives the minimum of |s - d| over the 64 positions.
//   max    : receives the maximum of |s - d| over the 64 positions.
void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  const __m128i u0 = _mm_setzero_si128();
  // Per-lane running extremes. Every absolute difference of two 8-bit samples
  // lies in [0, 255], so 0 and 255 are neutral starting values for max / min.
  __m128i maxabsdiff = u0;
  __m128i minabsdiff = _mm_set1_epi16(255);
  int row;

  for (row = 0; row < 8; ++row) {
    // Load 8 samples from each block and widen them to 16-bit lanes.
    const __m128i s0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(s + row * p)), u0);
    const __m128i d0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(d + row * dp)), u0);
    // |s - d| as max(diff, -diff).
    const __m128i diff = _mm_subs_epi16(s0, d0);
    const __m128i negdiff = _mm_subs_epi16(u0, diff);
    const __m128i absdiff = _mm_max_epi16(diff, negdiff);
    maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
    minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  }

  // Horizontal reduction: fold 8 lanes to 4, then 2, then 1 (lane 0).
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}
|
||||||
|
|
||||||
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
|
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
|
||||||
__m128i s0, s1, u0;
|
__m128i s0, s1, u0;
|
||||||
|
|||||||
Reference in New Issue
Block a user