Merge "Adds high bitdepth convolve, interpred & scaling"

This commit is contained in:
Deb Mukherjee 2014-09-18 10:52:23 -07:00 committed by Gerrit Code Review
commit 6d0ee9860e
15 changed files with 3240 additions and 96 deletions

File diff suppressed because it is too large Load Diff

View File

@ -282,3 +282,280 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
dst += dst_stride;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
// Horizontal 8-tap subpel convolution for high bitdepth (>8-bit) pixels.
// src8/dst8 are uint8_t* aliases of uint16_t buffers; CONVERT_TO_SHORTPTR
// recovers the real 16-bit pointers. x0_q4 is the starting filter phase and
// x_step_q4 the per-pixel phase increment, both in 1/16-pel (q4) units.
// bd is the bit depth used to clamp the filtered output.
static void high_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const InterpKernel *x_filters,
int x0_q4, int x_step_q4,
int w, int h, int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
// Step back so the left taps of the 8-tap filter are covered.
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
// Integer source position (high bits) and fractional phase (low bits).
const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_x[k] * x_filter[k];
// Round/shift by FILTER_BITS, then clamp to the bd-bit pixel range.
dst[x] = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
x_q4 += x_step_q4;
}
src += src_stride;
dst += dst_stride;
}
}
// Same horizontal high bitdepth convolution as high_convolve_horiz, but the
// filtered result is averaged (with rounding) into the existing dst pixels
// instead of overwriting them — used for compound (second) predictions.
static void high_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const InterpKernel *x_filters,
int x0_q4, int x_step_q4,
int w, int h, int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
// Step back so the left taps of the 8-tap filter are covered.
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_x[k] * x_filter[k];
// dst = round((dst + clamped filter output) / 2).
dst[x] = ROUND_POWER_OF_TWO(dst[x] +
clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
x_q4 += x_step_q4;
}
src += src_stride;
dst += dst_stride;
}
}
// Vertical 8-tap subpel convolution for high bitdepth pixels. Mirrors
// high_convolve_horiz but iterates column-major: the outer loop walks
// columns and the filter taps stride through rows (k * src_stride).
static void high_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const InterpKernel *y_filters,
int y0_q4, int y_step_q4, int w, int h,
int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
// Step back by whole rows to cover the top taps of the 8-tap filter.
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_y[k * src_stride] * y_filter[k];
// Round/shift and clamp to the bd-bit pixel range.
dst[y * dst_stride] = clip_pixel_high(
ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
y_q4 += y_step_q4;
}
++src;
++dst;
}
}
// Vertical high bitdepth convolution with averaging: like high_convolve_vert
// but the clamped filter output is rounded-averaged into the existing dst
// pixels (compound prediction path).
static void high_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const InterpKernel *y_filters,
int y0_q4, int y_step_q4, int w, int h,
int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
// Step back by whole rows to cover the top taps of the 8-tap filter.
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_y[k * src_stride] * y_filter[k];
// dst = round((dst + clamped filter output) / 2).
dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
y_q4 += y_step_q4;
}
++src;
++dst;
}
}
// Separable 2D high bitdepth convolution: horizontal pass into a fixed-size
// intermediate buffer, then vertical pass into dst. The buffer-size
// derivation below bounds the parameters (asserted before use).
static void high_convolve(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *const x_filters,
int x0_q4, int x_step_q4,
const InterpKernel *const y_filters,
int y0_q4, int y_step_q4,
int w, int h, int bd) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint16_t temp[64 * 135];
int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
// When upscaling, the vertical pass still reads at least h rows.
if (intermediate_height < h)
intermediate_height = h;
// Pass 1: filter horizontally, starting SUBPEL_TAPS/2-1 rows above the
// block so the vertical filter tails are available in temp (stride 64).
high_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
src_stride, CONVERT_TO_BYTEPTR(temp), 64,
x_filters, x0_q4, x_step_q4, w,
intermediate_height, bd);
// Pass 2: filter temp vertically, offset past the extra top rows.
high_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
w, h, bd);
}
void vp9_high_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h, int bd) {
  // Horizontal-only high bitdepth filtering: resolve the filter bank and
  // initial subpel phase, then delegate. Vertical arguments are unused.
  const InterpKernel *const kernels = get_filter_base(filter_x);
  const int phase_q4 = get_filter_offset(filter_x, kernels);
  (void)filter_y;
  (void)y_step_q4;
  high_convolve_horiz(src, src_stride, dst, dst_stride, kernels, phase_q4,
                      x_step_q4, w, h, bd);
}
void vp9_high_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h, int bd) {
  // Horizontal-only high bitdepth filtering, averaged into dst (compound
  // prediction). Vertical arguments are unused.
  const InterpKernel *const kernels = get_filter_base(filter_x);
  const int phase_q4 = get_filter_offset(filter_x, kernels);
  (void)filter_y;
  (void)y_step_q4;
  high_convolve_avg_horiz(src, src_stride, dst, dst_stride, kernels,
                          phase_q4, x_step_q4, w, h, bd);
}
void vp9_high_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h, int bd) {
  // Vertical-only high bitdepth filtering: resolve the filter bank and
  // initial subpel phase, then delegate. Horizontal arguments are unused.
  const InterpKernel *const kernels = get_filter_base(filter_y);
  const int phase_q4 = get_filter_offset(filter_y, kernels);
  (void)filter_x;
  (void)x_step_q4;
  high_convolve_vert(src, src_stride, dst, dst_stride, kernels, phase_q4,
                     y_step_q4, w, h, bd);
}
void vp9_high_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h, int bd) {
  // Vertical-only high bitdepth filtering, averaged into dst (compound
  // prediction). Horizontal arguments are unused.
  const InterpKernel *const kernels = get_filter_base(filter_y);
  const int phase_q4 = get_filter_offset(filter_y, kernels);
  (void)filter_x;
  (void)x_step_q4;
  high_convolve_avg_vert(src, src_stride, dst, dst_stride, kernels,
                         phase_q4, y_step_q4, w, h, bd);
}
void vp9_high_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h, int bd) {
  // Full 2D high bitdepth subpel filtering: look up both filter banks and
  // their starting phases, then run the separable two-pass convolution.
  const InterpKernel *const kernels_x = get_filter_base(filter_x);
  const int x_phase_q4 = get_filter_offset(filter_x, kernels_x);
  const InterpKernel *const kernels_y = get_filter_base(filter_y);
  const int y_phase_q4 = get_filter_offset(filter_y, kernels_y);
  high_convolve(src, src_stride, dst, dst_stride,
                kernels_x, x_phase_q4, x_step_q4,
                kernels_y, y_phase_q4, y_step_q4, w, h, bd);
}
// 2D high bitdepth convolution followed by averaging with dst: filter into a
// fixed 64x64 intermediate buffer, then round-average it into dst. The fixed
// buffer limits blocks to 64x64 (asserted).
void vp9_high_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED_ARRAY(16, uint16_t, temp, 64 * 64);
assert(w <= 64);
assert(h <= 64);
vp9_high_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
// NULL filters/steps are ignored by the plain averaging routine.
vp9_high_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
NULL, 0, NULL, 0, w, h, bd);
}
void vp9_high_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                              uint8_t *dst8, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
                              const int16_t *filter_y, int filter_y_stride,
                              int w, int h, int bd) {
  // Plain row-by-row copy of a high bitdepth block; the filter arguments
  // exist only so the signature matches the other convolve functions.
  int y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;
  for (y = 0; y < h; ++y) {
    vpx_memcpy(dst, src, w * sizeof(uint16_t));
    src += src_stride;
    dst += dst_stride;
  }
}
void vp9_high_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                             uint8_t *dst8, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h, int bd) {
  // Element-wise rounded average of the source block into dst:
  // dst = round((dst + src) / 2). Filter arguments are signature-only.
  int r, c;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c)
      dst[c] = ROUND_POWER_OF_TWO(dst[c] + src[c], 1);
    src += src_stride;
    dst += dst_stride;
  }
}
#endif

View File

@ -23,6 +23,14 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h);
#if CONFIG_VP9_HIGHBITDEPTH
typedef void (*high_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd);
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -63,6 +63,53 @@ static void build_mc_border(const uint8_t *src, int src_stride,
} while (--b_h);
}
#if CONFIG_VP9_HIGHBITDEPTH
// Build a motion-compensation border for a high bitdepth reference block.
// Copies a b_w x b_h window at (x, y) from the w x h source frame into dst,
// replicating edge pixels wherever the window extends outside the frame.
static void high_build_mc_border(const uint8_t *src8, int src_stride,
uint16_t *dst, int dst_stride,
int x, int y, int b_w, int b_h,
int w, int h) {
// Get a pointer to the start of the real data for this row.
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *ref_row = src - x - y * src_stride;
// Clamp the starting row to the valid frame rows [0, h - 1].
if (y >= h)
ref_row += (h - 1) * src_stride;
else if (y > 0)
ref_row += y * src_stride;
do {
int right = 0, copy;
// left/right: pixels outside the frame on each side; copy: valid middle.
int left = x < 0 ? -x : 0;
if (left > b_w)
left = b_w;
if (x + b_w > w)
right = x + b_w - w;
if (right > b_w)
right = b_w;
copy = b_w - left - right;
// Replicate the first/last frame pixel into the out-of-frame margins.
if (left)
vpx_memset16(dst, ref_row[0], left);
if (copy)
memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
if (right)
vpx_memset16(dst + left + copy, ref_row[w - 1], right);
dst += dst_stride;
++y;
// Only advance the source row while it stays inside the frame.
if (y > 0 && y < h)
ref_row += src_stride;
} while (--b_h);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
static void inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
@ -97,6 +144,42 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
}
#if CONFIG_VP9_HIGHBITDEPTH
// Dispatch to the high bitdepth predictor chosen by the scale factors:
// indexed by whether a subpel phase exists in x and in y, and by ref
// (second/compound reference selects the averaging variant).
static void high_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
const struct scale_factors *sf,
int w, int h, int ref,
const InterpKernel *kernel,
int xs, int ys, int bd) {
sf->high_predict[subpel_x != 0][subpel_y != 0][ref](
src, src_stride, dst, dst_stride,
kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
}
// High bitdepth counterpart of vp9_build_inter_predictor: convert the motion
// vector to q4 precision, scale it for the reference frame, split it into an
// integer source offset plus subpel phases, and run the predictor.
void vp9_high_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *src_mv,
const struct scale_factors *sf,
int w, int h, int ref,
const InterpKernel *kernel,
enum mv_precision precision,
int x, int y, int bd) {
const int is_q4 = precision == MV_PRECISION_Q4;
// Q3 input is doubled to reach 1/8 -> 1/16 pel (q4) precision.
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
is_q4 ? src_mv->col : src_mv->col * 2 };
MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
// Low SUBPEL_BITS of the scaled mv are the fractional filter phases.
const int subpel_x = mv.col & SUBPEL_MASK;
const int subpel_y = mv.row & SUBPEL_MASK;
// Integer part of the mv offsets the source pointer.
src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
static INLINE int round_mv_comp_q4(int value) {
return (value < 0 ? value - 2 : value + 2) / 4;
}
@ -222,8 +305,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
+ (scaled_mv.col >> SUBPEL_BITS);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
xd->bd);
} else {
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
}
#else
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
@ -393,16 +487,64 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
// Extend the border.
build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1,
x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_build_mc_border(buf_ptr1,
pre_buf->stride,
xd->mc_buf_high,
x1 - x0 + 1,
x0,
y0,
x1 - x0 + 1,
y1 - y0 + 1,
frame_width,
frame_height);
buf_stride = x1 - x0 + 1;
buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
y_pad * 3 * buf_stride + x_pad * 3;
} else {
build_mc_border(buf_ptr1,
pre_buf->stride,
xd->mc_buf,
x1 - x0 + 1,
x0,
y0,
x1 - x0 + 1,
y1 - y0 + 1,
frame_width,
frame_height);
buf_stride = x1 - x0 + 1;
buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
}
#else
build_mc_border(buf_ptr1,
pre_buf->stride,
xd->mc_buf,
x1 - x0 + 1,
x0,
y0,
x1 - x0 + 1,
y1 - y0 + 1,
frame_width,
frame_height);
buf_stride = x1 - x0 + 1;
buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
} else {
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
}
#else
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}

View File

@ -39,6 +39,17 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
enum mv_precision precision,
int x, int y);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_high_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *mv_q3,
const struct scale_factors *sf,
int w, int h, int do_avg,
const InterpKernel *kernel,
enum mv_precision precision,
int x, int y, int bd);
#endif
static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *sf) {
const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset;

View File

@ -606,6 +606,33 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
specialize qw/vp9_high_dc_128_predictor_32x32/;
#
# Sub Pixel Filters
#
add_proto qw/void vp9_high_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve_copy/;
add_proto qw/void vp9_high_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve_avg/;
add_proto qw/void vp9_high_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve8/, "$sse2_x86_64";
add_proto qw/void vp9_high_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve8_horiz/, "$sse2_x86_64";
add_proto qw/void vp9_high_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve8_vert/, "$sse2_x86_64";
add_proto qw/void vp9_high_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve8_avg/, "$sse2_x86_64";
add_proto qw/void vp9_high_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve8_avg_horiz/, "$sse2_x86_64";
add_proto qw/void vp9_high_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve8_avg_vert/, "$sse2_x86_64";
#
# dct
#

View File

@ -43,9 +43,16 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
return res;
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
int other_w, int other_h,
int this_w, int this_h,
int use_high) {
#else
void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
int other_w, int other_h,
int this_w, int this_h) {
#endif
if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
sf->x_scale_fp = REF_INVALID_SCALE;
sf->y_scale_fp = REF_INVALID_SCALE;
@ -111,4 +118,48 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
// 2D subpel motion always gets filtered in both directions
sf->predict[1][1][0] = vp9_convolve8;
sf->predict[1][1][1] = vp9_convolve8_avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (use_high) {
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
sf->high_predict[0][0][0] = vp9_high_convolve_copy;
sf->high_predict[0][0][1] = vp9_high_convolve_avg;
sf->high_predict[0][1][0] = vp9_high_convolve8_vert;
sf->high_predict[0][1][1] = vp9_high_convolve8_avg_vert;
sf->high_predict[1][0][0] = vp9_high_convolve8_horiz;
sf->high_predict[1][0][1] = vp9_high_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
sf->high_predict[0][0][0] = vp9_high_convolve8_vert;
sf->high_predict[0][0][1] = vp9_high_convolve8_avg_vert;
sf->high_predict[0][1][0] = vp9_high_convolve8_vert;
sf->high_predict[0][1][1] = vp9_high_convolve8_avg_vert;
sf->high_predict[1][0][0] = vp9_high_convolve8;
sf->high_predict[1][0][1] = vp9_high_convolve8_avg;
}
} else {
if (sf->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
sf->high_predict[0][0][0] = vp9_high_convolve8_horiz;
sf->high_predict[0][0][1] = vp9_high_convolve8_avg_horiz;
sf->high_predict[0][1][0] = vp9_high_convolve8;
sf->high_predict[0][1][1] = vp9_high_convolve8_avg;
sf->high_predict[1][0][0] = vp9_high_convolve8_horiz;
sf->high_predict[1][0][1] = vp9_high_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
sf->high_predict[0][0][0] = vp9_high_convolve8;
sf->high_predict[0][0][1] = vp9_high_convolve8_avg;
sf->high_predict[0][1][0] = vp9_high_convolve8;
sf->high_predict[0][1][1] = vp9_high_convolve8_avg;
sf->high_predict[1][0][0] = vp9_high_convolve8;
sf->high_predict[1][0][1] = vp9_high_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions.
sf->high_predict[1][1][0] = vp9_high_convolve8;
sf->high_predict[1][1][1] = vp9_high_convolve8_avg;
}
#endif
}

View File

@ -32,13 +32,23 @@ struct scale_factors {
int (*scale_value_y)(int val, const struct scale_factors *sf);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
#if CONFIG_VP9_HIGHBITDEPTH
high_convolve_fn_t high_predict[2][2][2]; // horiz, vert, avg
#endif
};
MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
int other_w, int other_h,
int this_w, int this_h,
int use_high);
#else
void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
int other_w, int other_h,
int this_w, int this_h);
#endif
static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
return sf->x_scale_fp != REF_INVALID_SCALE &&

View File

@ -139,6 +139,153 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
#if CONFIG_VP9_HIGHBITDEPTH
typedef void high_filter8_1dfunction (
const uint16_t *src_ptr,
const ptrdiff_t src_pitch,
uint16_t *output_ptr,
ptrdiff_t out_pitch,
unsigned int output_height,
const int16_t *filter,
int bd
);
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
void vp9_high_convolve8_##name##_##opt(const uint8_t *src8, \
ptrdiff_t src_stride, \
uint8_t *dst8, ptrdiff_t dst_stride, \
const int16_t *filter_x, \
int x_step_q4, \
const int16_t *filter_y, \
int y_step_q4, \
int w, int h, int bd) { \
if (step_q4 == 16 && filter[3] != 128) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
if (filter[0] || filter[1] || filter[2]) { \
while (w >= 16) { \
vp9_high_filter_block1d16_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
h, \
filter, \
bd); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
vp9_high_filter_block1d8_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
h, \
filter, \
bd); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
vp9_high_filter_block1d4_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
h, \
filter, \
bd); \
src += 4; \
dst += 4; \
w -= 4; \
} \
} else { \
while (w >= 16) { \
vp9_high_filter_block1d16_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
h, \
filter, \
bd); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
vp9_high_filter_block1d8_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
h, \
filter, \
bd); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
vp9_high_filter_block1d4_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
h, \
filter, \
bd); \
src += 4; \
dst += 4; \
w -= 4; \
} \
} \
} \
if (w) { \
vp9_high_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h, bd); \
} \
}
#define HIGH_FUN_CONV_2D(avg, opt) \
void vp9_high_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
int w, int h, int bd) { \
assert(w <= 64); \
assert(h <= 64); \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 71); \
vp9_high_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7, bd); \
vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h, bd); \
} else { \
DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 65); \
vp9_high_convolve8_horiz_##opt(src, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1, bd); \
vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h, bd); \
} \
} else { \
vp9_high_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, \
h, bd); \
} \
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vp9_filter_block1d16_v8_avx2;
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
@ -336,4 +483,75 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
// int w, int h);
FUN_CONV_2D(, sse2);
FUN_CONV_2D(avg_ , sse2);
#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
high_filter8_1dfunction vp9_high_filter_block1d16_v8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h2_avg_sse2;
// void vp9_high_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
// void vp9_high_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
// void vp9_high_convolve8_avg_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x,
// int x_step_q4,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
// void vp9_high_convolve8_avg_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
sse2);
// void vp9_high_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
// void vp9_high_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2);
HIGH_FUN_CONV_2D(avg_ , sse2);
#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
#endif // HAVE_SSE2

View File

@ -0,0 +1,962 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.
;Prepare the 8 16-bit filter taps from arg(5) for the 4-wide path.
;Taps are interleaved pairwise into the stack slots k0k6, k2k5, k3k4 and
;k1k7 so each pmaddwd combines two taps; per the note at the top of this
;file, the large taps 3 and 4 are kept together and added last to avoid
;intermediate overflow. Also sets krd = 64 (rounding constant for the
;final >> 7) and computes the clamp bounds from arg(6) (bps):
;max = (1 << bps) - 1, min = 0.
%macro HIGH_GET_FILTERS_4 0
mov rdx, arg(5) ;filter ptr
mov rcx, 0x00000040
movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
psrldq xmm7, 8
pshuflw xmm4, xmm7, 0b ;k4
pshuflw xmm5, xmm7, 01010101b ;k5
pshuflw xmm6, xmm7, 10101010b ;k6
pshuflw xmm7, xmm7, 11111111b ;k7
punpcklwd xmm0, xmm6
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4
punpcklwd xmm1, xmm7
movdqa k0k6, xmm0
movdqa k2k5, xmm2
movdqa k3k4, xmm3
movdqa k1k7, xmm1
movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6
;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1 ;1 << bps (per 16-bit lane)
psubw xmm0, xmm2 ;(1 << bps) - 1
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
%endm
;Apply the 8-tap filter to 4 output samples.
;Expects the tap-0..tap-7 input samples in xmm0..xmm7 (16-bit lanes).
;Rows are paired to match the paired tap layout, multiplied with pmaddwd,
;summed (k3k4 added last - see overflow note at the top of the file),
;rounded by krd, shifted >> 7, packed back to words and clamped to
;[min, max]. %1 != 0 selects the "avg" variant: the result is averaged
;with the existing destination via pavgw, i.e. (a + b + 1) >> 1.
;Stores 4 16-bit samples at [rdi].
%macro HIGH_APPLY_FILTER_4 1
punpcklwd xmm0, xmm6 ;two row in one register
punpcklwd xmm1, xmm7
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4
pmaddwd xmm0, k0k6 ;multiply the filter factors
pmaddwd xmm1, k1k7
pmaddwd xmm2, k2k5
pmaddwd xmm3, k3k4
paddd xmm0, xmm1 ;sum
paddd xmm0, xmm2
paddd xmm0, xmm3 ;k3k4 contribution added last
paddd xmm0, krd ;rounding
psrad xmm0, 7 ;shift
packssdw xmm0, xmm0 ;pack to word
;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
%if %1
movq xmm1, [rdi]
pavgw xmm0, xmm1 ;average with existing dst
%endif
movq [rdi], xmm0
%endm
;Prepare the 8 16-bit filter taps from arg(5) for the 8-wide path and
;load the src/dst pointers. Taps are interleaved into the stack slots
;k0k1, k6k7, k2k5 and k3k4 for pairwise pmaddwd; taps 3 and 4 are kept
;together and added last to avoid overflow (see note at top of file).
;Also sets krd = 64 (rounding for >> 7) and the clamp bounds from
;arg(6) (bps): max = (1 << bps) - 1, min = 0.
%macro HIGH_GET_FILTERS 0
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x00000040
movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
pshufhw xmm4, xmm7, 0b ;k4
pshufhw xmm5, xmm7, 01010101b ;k5
pshufhw xmm6, xmm7, 10101010b ;k6
pshufhw xmm7, xmm7, 11111111b ;k7
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
punpcklwd xmm0, xmm1
punpckhwd xmm6, xmm7
punpckhwd xmm2, xmm5
punpckhwd xmm3, xmm4
movdqa k0k1, xmm0 ;store filter factors on stack
movdqa k6k7, xmm6
movdqa k2k5, xmm2
movdqa k3k4, xmm3
movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6 ;rounding
;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1 ;1 << bps (per 16-bit lane)
psubw xmm0, xmm2 ;(1 << bps) - 1
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
%endm
;Load 8 source rows (8 16-bit samples each, byte offset %1) into
;xmm0..xmm7, numbered by row/tap index in the trailing comments.
;NOTE: advances rsi by one row (rax bytes) as a side effect - the
;callers rely on this for the per-iteration row step.
%macro LOAD_VERT_8 1
movdqu xmm0, [rsi + %1] ;0
movdqu xmm1, [rsi + rax + %1] ;1
movdqu xmm6, [rsi + rdx * 2 + %1] ;6
lea rsi, [rsi + rax] ;advance one row (side effect)
movdqu xmm7, [rsi + rdx * 2 + %1] ;7
movdqu xmm2, [rsi + rax + %1] ;2
movdqu xmm3, [rsi + rax * 2 + %1] ;3
movdqu xmm4, [rsi + rdx + %1] ;4
movdqu xmm5, [rsi + rax * 4 + %1] ;5
%endm
;Apply the 8-tap filter to 8 output samples.
;%1 != 0 selects the avg variant (pavgw with existing dst);
;%2 is the byte offset into the destination row.
;Uses the stack slot `temp` to spill xmm4 since only xmm0-xmm7 are
;used here; low/high halves are widened to dwords, multiplied with the
;paired taps, summed (k3k4 last - overflow note at top of file),
;rounded, shifted >> 7, repacked and clamped to [min, max].
%macro HIGH_APPLY_FILTER_8 2
movdqu temp, xmm4 ;spill row 4
movdqa xmm4, xmm0
punpcklwd xmm0, xmm1
punpckhwd xmm4, xmm1
movdqa xmm1, xmm6
punpcklwd xmm6, xmm7
punpckhwd xmm1, xmm7
movdqa xmm7, xmm2
punpcklwd xmm2, xmm5
punpckhwd xmm7, xmm5
movdqu xmm5, temp ;reload row 4
movdqu temp, xmm4 ;spill high half of rows 0/1
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5
punpckhwd xmm4, xmm5
movdqu xmm5, temp
pmaddwd xmm0, k0k1
pmaddwd xmm5, k0k1
pmaddwd xmm6, k6k7
pmaddwd xmm1, k6k7
pmaddwd xmm2, k2k5
pmaddwd xmm7, k2k5
pmaddwd xmm3, k3k4
pmaddwd xmm4, k3k4
paddd xmm0, xmm6
paddd xmm0, xmm2
paddd xmm0, xmm3 ;k3k4 contribution added last
paddd xmm5, xmm1
paddd xmm5, xmm7
paddd xmm5, xmm4
paddd xmm0, krd ;rounding
paddd xmm5, krd
psrad xmm0, 7 ;shift
psrad xmm5, 7
packssdw xmm0, xmm5 ;pack back to word
;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
%if %1
movdqu xmm1, [rdi + %2]
pavgw xmm0, xmm1 ;average with existing dst
%endif
movdqu [rdi + %2], xmm0
%endm
;void vp9_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
;4-wide 8-tap vertical filter on 16-bit (high bitdepth) samples.
;arg(6) (bps) is consumed by HIGH_GET_FILTERS_4 for the clamp range.
;Pitches arrive in pixels; they are doubled below to get byte strides.
global sym(vp9_high_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
;stack slots used by HIGH_GET_FILTERS_4 / HIGH_APPLY_FILTER_4
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rbx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rbx, [rbx + rbx]
lea rdx, [rax + rax * 2] ;rdx = 3 rows in bytes
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
;load the 8 taps' rows into xmm0..xmm7 (numbered by row);
;the inner lea advances rsi by one row, giving the per-iteration step
movq xmm0, [rsi] ;load src: row 0
movq xmm1, [rsi + rax] ;1
movq xmm6, [rsi + rdx * 2] ;6
lea rsi, [rsi + rax]
movq xmm7, [rsi + rdx * 2] ;7
movq xmm2, [rsi + rax] ;2
movq xmm3, [rsi + rax * 2] ;3
movq xmm4, [rsi + rdx] ;4
movq xmm5, [rsi + rax * 4] ;5
HIGH_APPLY_FILTER_4 0
lea rdi, [rdi + rbx]
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp ;restore rsp saved by ALIGN_STACK
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
;8-wide 8-tap vertical filter on 16-bit samples; same structure as the
;4-wide version but using LOAD_VERT_8 / HIGH_APPLY_FILTER_8.
global sym(vp9_high_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
;stack slots used by HIGH_GET_FILTERS / HIGH_APPLY_FILTER_8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rbx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rbx, [rbx + rbx]
lea rdx, [rax + rax * 2]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
LOAD_VERT_8 0 ;also advances rsi by one row
HIGH_APPLY_FILTER_8 0, 0
lea rdi, [rdi + rbx]
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
;16-wide 8-tap vertical filter: each row is processed as two 8-wide
;halves (byte offsets 0 and 16). The `sub rsi, rax` undoes the one-row
;advance done inside the first LOAD_VERT_8 so both halves read the same
;rows; the second LOAD_VERT_8 then performs the per-iteration step.
global sym(vp9_high_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rbx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rbx, [rbx + rbx]
lea rdx, [rax + rax * 2]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
LOAD_VERT_8 0
HIGH_APPLY_FILTER_8 0, 0
sub rsi, rax ;rewind the row advance from LOAD_VERT_8
LOAD_VERT_8 16
HIGH_APPLY_FILTER_8 0, 16
add rdi, rbx
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;4-wide 8-tap vertical filter, "avg" variant: identical to
;vp9_high_filter_block1d4_v8_sse2 except the result is averaged with the
;existing destination (HIGH_APPLY_FILTER_4 invoked with 1 -> pavgw).
global sym(vp9_high_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rbx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rbx, [rbx + rbx]
lea rdx, [rax + rax * 2]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movq xmm0, [rsi] ;load src: row 0
movq xmm1, [rsi + rax] ;1
movq xmm6, [rsi + rdx * 2] ;6
lea rsi, [rsi + rax]
movq xmm7, [rsi + rdx * 2] ;7
movq xmm2, [rsi + rax] ;2
movq xmm3, [rsi + rax * 2] ;3
movq xmm4, [rsi + rdx] ;4
movq xmm5, [rsi + rax * 4] ;5
HIGH_APPLY_FILTER_4 1 ;1 = average with dst
lea rdi, [rdi + rbx]
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp ;restore rsp saved by ALIGN_STACK
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;8-wide 8-tap vertical filter, "avg" variant of
;vp9_high_filter_block1d8_v8_sse2.
global sym(vp9_high_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rbx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rbx, [rbx + rbx]
lea rdx, [rax + rax * 2]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
LOAD_VERT_8 0 ;also advances rsi by one row
HIGH_APPLY_FILTER_8 1, 0 ;1 = average with dst
lea rdi, [rdi + rbx]
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;16-wide 8-tap vertical filter, "avg" variant of
;vp9_high_filter_block1d16_v8_sse2 (two 8-wide halves per row).
global sym(vp9_high_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rbx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rbx, [rbx + rbx]
lea rdx, [rax + rax * 2]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
LOAD_VERT_8 0
HIGH_APPLY_FILTER_8 1, 0
sub rsi, rax ;rewind the row advance from LOAD_VERT_8
LOAD_VERT_8 16
HIGH_APPLY_FILTER_8 1, 16
add rdi, rbx
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
;4-wide 8-tap horizontal filter on 16-bit samples. Reads start at
;rsi - 6 bytes = 3 samples before the output position. The eight
;tap-aligned 4-sample windows for xmm0..xmm7 are built from two loads
;by byte-shifting (psrldq by 2/4/6 = 1/2/3 samples).
global sym(vp9_high_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
;stack slots used by HIGH_GET_FILTERS_4 / HIGH_APPLY_FILTER_4
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src (tap 0 window)
movdqu xmm4, [rsi + 2] ;tap 4 window
movdqa xmm1, xmm0
movdqa xmm6, xmm4
movdqa xmm7, xmm4
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm5, xmm4
psrldq xmm1, 2 ;tap 1
psrldq xmm6, 4 ;tap 6
psrldq xmm7, 6 ;tap 7
psrldq xmm2, 4 ;tap 2
psrldq xmm3, 6 ;tap 3
psrldq xmm5, 2 ;tap 5
HIGH_APPLY_FILTER_4 0
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp ;restore rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
;8-wide 8-tap horizontal filter: the eight tap windows are loaded
;directly at byte offsets -6..+8 (sample offsets -3..+4).
global sym(vp9_high_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src (tap 0..7 windows)
movdqu xmm1, [rsi - 4]
movdqu xmm2, [rsi - 2]
movdqu xmm3, [rsi]
movdqu xmm4, [rsi + 2]
movdqu xmm5, [rsi + 4]
movdqu xmm6, [rsi + 6]
movdqu xmm7, [rsi + 8]
HIGH_APPLY_FILTER_8 0, 0
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
;16-wide 8-tap horizontal filter: two 8-wide passes per row, the second
;shifted by 16 bytes (8 samples) in both source and destination.
global sym(vp9_high_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src (first 8 outputs)
movdqu xmm1, [rsi - 4]
movdqu xmm2, [rsi - 2]
movdqu xmm3, [rsi]
movdqu xmm4, [rsi + 2]
movdqu xmm5, [rsi + 4]
movdqu xmm6, [rsi + 6]
movdqu xmm7, [rsi + 8]
HIGH_APPLY_FILTER_8 0, 0
movdqu xmm0, [rsi + 10] ;load src (second 8 outputs)
movdqu xmm1, [rsi + 12]
movdqu xmm2, [rsi + 14]
movdqu xmm3, [rsi + 16]
movdqu xmm4, [rsi + 18]
movdqu xmm5, [rsi + 20]
movdqu xmm6, [rsi + 22]
movdqu xmm7, [rsi + 24]
HIGH_APPLY_FILTER_8 0, 16
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;4-wide 8-tap horizontal filter, "avg" variant: identical to
;vp9_high_filter_block1d4_h8_sse2 except the result is averaged with the
;existing destination (HIGH_APPLY_FILTER_4 invoked with 1 -> pavgw).
global sym(vp9_high_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm4, [rsi + 2]
movdqa xmm1, xmm0
movdqa xmm6, xmm4
movdqa xmm7, xmm4
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm5, xmm4
psrldq xmm1, 2 ;tap 1
psrldq xmm6, 4 ;tap 6
psrldq xmm7, 6 ;tap 7
psrldq xmm2, 4 ;tap 2
psrldq xmm3, 6 ;tap 3
psrldq xmm5, 2 ;tap 5
HIGH_APPLY_FILTER_4 1 ;1 = average with dst
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp ;restore rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;8-wide 8-tap horizontal filter, "avg" variant of
;vp9_high_filter_block1d8_h8_sse2.
global sym(vp9_high_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
movdqu xmm2, [rsi - 2]
movdqu xmm3, [rsi]
movdqu xmm4, [rsi + 2]
movdqu xmm5, [rsi + 4]
movdqu xmm6, [rsi + 6]
movdqu xmm7, [rsi + 8]
HIGH_APPLY_FILTER_8 1, 0 ;1 = average with dst
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;16-wide 8-tap horizontal filter, "avg" variant of
;vp9_high_filter_block1d16_h8_sse2 (two 8-wide passes per row).
global sym(vp9_high_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
lea rax, [rax + rax] ;bytes per line
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src (first 8 outputs)
movdqu xmm1, [rsi - 4]
movdqu xmm2, [rsi - 2]
movdqu xmm3, [rsi]
movdqu xmm4, [rsi + 2]
movdqu xmm5, [rsi + 4]
movdqu xmm6, [rsi + 6]
movdqu xmm7, [rsi + 8]
HIGH_APPLY_FILTER_8 1, 0
movdqu xmm0, [rsi + 10] ;load src (second 8 outputs)
movdqu xmm1, [rsi + 12]
movdqu xmm2, [rsi + 14]
movdqu xmm3, [rsi + 16]
movdqu xmm4, [rsi + 18]
movdqu xmm5, [rsi + 20]
movdqu xmm6, [rsi + 22]
movdqu xmm7, [rsi + 24]
HIGH_APPLY_FILTER_8 1, 16
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp ;restore rsp saved by ALIGN_STACK
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

View File

@ -0,0 +1,494 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;Set up for the 4-wide bilinear (2-tap) filter. Only taps 3 and 4 of
;the 8-tap filter array at arg(5) are used; they are paired into xmm4.
;Fixed register allocation consumed by HIGH_APPLY_FILTER_4 below:
; xmm4 = k3k4 pair, xmm3 = rounding (64), xmm5 = max = (1 << bps) - 1,
; xmm2 = min = 0, rax = src pitch (pixels), rdx = dst pitch (pixels),
; rcx = remaining output rows.
%macro HIGH_GET_PARAM_4 0
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x00000040
movdqa xmm3, [rdx] ;load filters
pshuflw xmm4, xmm3, 11111111b ;k3
psrldq xmm3, 8
pshuflw xmm3, xmm3, 0b ;k4
punpcklwd xmm4, xmm3 ;k3k4
movq xmm3, rcx ;rounding
pshufd xmm3, xmm3, 0
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm5, rdx
movq xmm2, rcx
pshufd xmm5, xmm5, 0b
movdqa xmm1, xmm5
psllw xmm5, xmm2 ;1 << bps
psubw xmm5, xmm1 ;max value (for clamping)
pxor xmm2, xmm2 ;min value (for clamping)
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
movsxd rcx, DWORD PTR arg(4) ;output_height
%endm
;Apply the 2-tap filter to 4 samples: xmm0/xmm1 hold the two source
;samples per output. Interleave, pmaddwd with k3k4, round, >> 7, pack,
;clamp to [min, max]; %1 != 0 averages with existing dst (pavgw).
;NOTE: also advances rsi/rdi by one row (2*pitch bytes) and does
;`dec rcx` - the caller's trailing jnz consumes the flags set here.
%macro HIGH_APPLY_FILTER_4 1
punpcklwd xmm0, xmm1 ;two row in one register
pmaddwd xmm0, xmm4 ;multiply the filter factors
paddd xmm0, xmm3 ;rounding
psrad xmm0, 7 ;shift
packssdw xmm0, xmm0 ;pack to word
;clamp the values
pminsw xmm0, xmm5
pmaxsw xmm0, xmm2
%if %1
movq xmm1, [rdi]
pavgw xmm0, xmm1 ;average with existing dst
%endif
movq [rdi], xmm0
lea rsi, [rsi + 2*rax] ;advance src one row (2 bytes/sample)
lea rdi, [rdi + 2*rdx] ;advance dst one row
dec rcx ;flags consumed by caller's jnz
%endm
;The 8- and 16-wide bilinear macros use xmm8/xmm9 and are therefore
;only available on x86-64.
%if ARCH_X86_64
;Set up for the 8/16-wide bilinear filter. Register allocation consumed
;by HIGH_APPLY_FILTER_8/16: xmm7 = k3k4 pair broadcast, xmm4 = rounding
;(64), xmm8 = max = (1 << bps) - 1, xmm5 = min = 0, rax = src pitch
;(pixels), rdx = dst pitch (pixels), rcx = remaining output rows.
%macro HIGH_GET_PARAM 0
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x00000040
movdqa xmm6, [rdx] ;load filters
pshuflw xmm7, xmm6, 11111111b ;k3
pshufhw xmm6, xmm6, 0b ;k4
psrldq xmm6, 8
punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
movq xmm4, rcx ;rounding
pshufd xmm4, xmm4, 0
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm8, rdx
movq xmm5, rcx
pshufd xmm8, xmm8, 0b
movdqa xmm1, xmm8
psllw xmm8, xmm5 ;1 << bps
psubw xmm8, xmm1 ;max value (for clamping)
pxor xmm5, xmm5 ;min value (for clamping)
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
movsxd rcx, DWORD PTR arg(4) ;output_height
%endm
;Apply the 2-tap filter to 8 samples (xmm0/xmm1 = the two source
;vectors). %1 != 0 averages with the existing dst. Also advances
;rsi/rdi by one row and does `dec rcx` for the caller's jnz.
%macro HIGH_APPLY_FILTER_8 1
movdqa xmm6, xmm0
punpckhwd xmm6, xmm1
punpcklwd xmm0, xmm1
pmaddwd xmm6, xmm7
pmaddwd xmm0, xmm7
paddd xmm6, xmm4 ;rounding
paddd xmm0, xmm4 ;rounding
psrad xmm6, 7 ;shift
psrad xmm0, 7 ;shift
packssdw xmm0, xmm6 ;pack back to word
;clamp the values
pminsw xmm0, xmm8
pmaxsw xmm0, xmm5
%if %1
movdqu xmm1, [rdi]
pavgw xmm0, xmm1 ;average with existing dst
%endif
movdqu [rdi], xmm0 ;store the result
lea rsi, [rsi + 2*rax] ;advance src one row
lea rdi, [rdi + 2*rdx] ;advance dst one row
dec rcx ;flags consumed by caller's jnz
%endm
;Apply the 2-tap filter to 16 samples: xmm0/xmm1 are the low half's two
;source vectors, xmm2/xmm3 the high half's. Otherwise identical to
;HIGH_APPLY_FILTER_8, including the row advance and `dec rcx`.
%macro HIGH_APPLY_FILTER_16 1
movdqa xmm9, xmm0
movdqa xmm6, xmm2
punpckhwd xmm9, xmm1
punpckhwd xmm6, xmm3
punpcklwd xmm0, xmm1
punpcklwd xmm2, xmm3
pmaddwd xmm9, xmm7
pmaddwd xmm6, xmm7
pmaddwd xmm0, xmm7
pmaddwd xmm2, xmm7
paddd xmm9, xmm4 ;rounding
paddd xmm6, xmm4
paddd xmm0, xmm4
paddd xmm2, xmm4
psrad xmm9, 7 ;shift
psrad xmm6, 7
psrad xmm0, 7
psrad xmm2, 7
packssdw xmm0, xmm9 ;pack back to word
packssdw xmm2, xmm6 ;pack back to word
;clamp the values
pminsw xmm0, xmm8
pmaxsw xmm0, xmm5
pminsw xmm2, xmm8
pmaxsw xmm2, xmm5
%if %1
movdqu xmm1, [rdi]
movdqu xmm3, [rdi + 16]
pavgw xmm0, xmm1 ;average with existing dst
pavgw xmm2, xmm3
%endif
movdqu [rdi], xmm0 ;store the result
movdqu [rdi + 16], xmm2 ;store the result
lea rsi, [rsi + 2*rax] ;advance src one row
lea rdi, [rdi + 2*rdx] ;advance dst one row
dec rcx ;flags consumed by caller's jnz
%endm
%endif
;4-wide bilinear (2-tap) vertical filter on 16-bit samples.
;xmm0 = current row, xmm1 = next row (2*rax bytes below). The APPLY
;macro stores, advances the pointers and decrements rcx; the jnz here
;consumes the flags set by that `dec`.
global sym(vp9_high_filter_block1d4_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; end prolog
HIGH_GET_PARAM_4
.loop:
movq xmm0, [rsi] ;load src
movq xmm1, [rsi + 2*rax] ;next row
HIGH_APPLY_FILTER_4 0
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%if ARCH_X86_64
;8-wide bilinear vertical filter (x86-64 only: uses xmm8).
global sym(vp9_high_filter_block1d8_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 8
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;0
movdqu xmm1, [rsi + 2*rax] ;1
HIGH_APPLY_FILTER_8 0
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;16-wide bilinear vertical filter (x86-64 only: uses xmm8/xmm9).
global sym(vp9_high_filter_block1d16_v2_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 9
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;0
movdqu xmm2, [rsi + 16] ;high half, row 0
movdqu xmm1, [rsi + 2*rax] ;1
movdqu xmm3, [rsi + 2*rax + 16] ;high half, row 1
HIGH_APPLY_FILTER_16 0
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%endif
;4-wide bilinear vertical filter, "avg" variant: identical to
;vp9_high_filter_block1d4_v2_sse2 except the result is averaged with
;the existing destination (macro invoked with 1 -> pavgw).
global sym(vp9_high_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; end prolog
HIGH_GET_PARAM_4
.loop:
movq xmm0, [rsi] ;load src
movq xmm1, [rsi + 2*rax] ;next row
HIGH_APPLY_FILTER_4 1 ;1 = average with dst
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%if ARCH_X86_64
;8-wide bilinear vertical filter, "avg" variant (x86-64 only).
global sym(vp9_high_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 8
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;0
movdqu xmm1, [rsi + 2*rax] ;1
HIGH_APPLY_FILTER_8 1 ;1 = average with dst
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;16-wide bilinear vertical filter, "avg" variant (x86-64 only).
global sym(vp9_high_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 9
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;0
movdqu xmm1, [rsi + 2*rax] ;1
movdqu xmm2, [rsi + 16] ;high half, row 0
movdqu xmm3, [rsi + 2*rax + 16] ;high half, row 1
HIGH_APPLY_FILTER_16 1 ;1 = average with dst
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%endif
;4-wide bilinear (2-tap) horizontal filter on 16-bit samples.
;xmm1 = src shifted by one 16-bit sample (psrldq 2). The APPLY macro
;stores, advances the pointers and decrements rcx; the jnz here
;consumes the flags set by that `dec`.
global sym(vp9_high_filter_block1d4_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; end prolog
HIGH_GET_PARAM_4
.loop:
movdqu xmm0, [rsi] ;load src
movdqa xmm1, xmm0
psrldq xmm1, 2 ;next horizontal sample
HIGH_APPLY_FILTER_4 0
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%if ARCH_X86_64
;8-wide bilinear horizontal filter (x86-64 only: uses xmm8).
global sym(vp9_high_filter_block1d8_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 8
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;load src
movdqu xmm1, [rsi + 2] ;next horizontal sample
HIGH_APPLY_FILTER_8 0
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;16-wide bilinear horizontal filter (x86-64 only: uses xmm8/xmm9).
global sym(vp9_high_filter_block1d16_h2_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 9
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;load src
movdqu xmm1, [rsi + 2]
movdqu xmm2, [rsi + 16] ;high half
movdqu xmm3, [rsi + 18]
HIGH_APPLY_FILTER_16 0
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%endif
;4-wide bilinear horizontal filter, "avg" variant: identical to
;vp9_high_filter_block1d4_h2_sse2 except the result is averaged with
;the existing destination (macro invoked with 1 -> pavgw).
global sym(vp9_high_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; end prolog
HIGH_GET_PARAM_4
.loop:
movdqu xmm0, [rsi] ;load src
movdqa xmm1, xmm0
psrldq xmm1, 2 ;next horizontal sample
HIGH_APPLY_FILTER_4 1 ;1 = average with dst
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%if ARCH_X86_64
;8-wide bilinear horizontal filter, "avg" variant (x86-64 only).
global sym(vp9_high_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 8
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;load src
movdqu xmm1, [rsi + 2] ;next horizontal sample
HIGH_APPLY_FILTER_8 1 ;1 = average with dst
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;16-wide bilinear horizontal filter, "avg" variant (x86-64 only).
global sym(vp9_high_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 9
push rsi
push rdi
; end prolog
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;load src
movdqu xmm1, [rsi + 2]
movdqu xmm2, [rsi + 16] ;high half
movdqu xmm3, [rsi + 18]
HIGH_APPLY_FILTER_16 1 ;1 = average with dst
jnz .loop ;flags from dec rcx in the macro
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%endif

View File

@ -1265,10 +1265,18 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
for (i = 0; i < REFS_PER_FRAME; ++i) {
RefBuffer *const ref_buf = &cm->frame_refs[i];
#if CONFIG_VP9_HIGHBITDEPTH
vp9_setup_scale_factors_for_frame(&ref_buf->sf,
ref_buf->buf->y_crop_width,
ref_buf->buf->y_crop_height,
cm->width, cm->height,
cm->use_highbitdepth);
#else
vp9_setup_scale_factors_for_frame(&ref_buf->sf,
ref_buf->buf->y_crop_width,
ref_buf->buf->y_crop_height,
cm->width, cm->height);
#endif
if (vp9_is_scaled(&ref_buf->sf))
vp9_extend_frame_borders(ref_buf->buf);
}

View File

@ -2767,10 +2767,17 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
ref_buf->buf = buf;
ref_buf->idx = idx;
#if CONFIG_VP9_HIGHBITDEPTH
vp9_setup_scale_factors_for_frame(&ref_buf->sf,
buf->y_crop_width, buf->y_crop_height,
cm->width, cm->height,
(buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
1 : 0);
#else
vp9_setup_scale_factors_for_frame(&ref_buf->sf,
buf->y_crop_width, buf->y_crop_height,
cm->width, cm->height);
#endif
if (vp9_is_scaled(&ref_buf->sf))
vp9_extend_frame_borders(buf);
}

View File

@ -454,12 +454,20 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
// In spatial svc the scaling factors might be less then 1/2. So we will use
// non-normative scaling.
int frame_used = 0;
#if CONFIG_VP9_HIGHBITDEPTH
vp9_setup_scale_factors_for_frame(&sf,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height,
cm->use_highbitdepth);
#else
vp9_setup_scale_factors_for_frame(&sf,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height);
#endif
for (frame = 0; frame < frames_to_blur; ++frame) {
if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
@ -481,11 +489,20 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
}
} else {
// ARF is produced at the native frame size and resized when coded.
#if CONFIG_VP9_HIGHBITDEPTH
vp9_setup_scale_factors_for_frame(&sf,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
cm->use_highbitdepth);
#else
vp9_setup_scale_factors_for_frame(&sf,
frames[0]->y_crop_width,
frames[0]->y_crop_height,
frames[0]->y_crop_width,
frames[0]->y_crop_height);
#endif
}
temporal_filter_iterate_c(cpi, frames, frames_to_blur,

View File

@ -91,6 +91,8 @@ endif
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
endif
# common (c)