post-proc: deblock filter optimization
1. Algorithm modification: Instead of having the same filter threshold for a whole frame, we now allow the thresholds to be adjusted for each macroblock. In the current implementation, to avoid excessive blur on the background as reported in issue 480 (http://code.google.com/p/webm/issues/detail?id=480), we reduce the thresholds for skipped macroblocks. 2. SSE2 optimization: As started in issue 479 (http://code.google.com/p/webm/issues/detail?id=479), the filter calculation was adjusted for better performance. The C code was also modified accordingly. This made the deblock filter 2x faster, and the decoder 1.2x faster overall. Next, the demacroblock filter will be modified similarly. Change-Id: I05e54c3f580ccd427487d085096b3174f2ab7e86
This commit is contained in:
parent
9704cdec9f
commit
4c53bacce4
@ -19,9 +19,9 @@ typedef void (*post_proc_func_t)(unsigned char *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
int src_pixels_per_line,
|
||||
int dst_pixels_per_line,
|
||||
int rows,
|
||||
int cols,
|
||||
int flimit);
|
||||
unsigned char *flimit,
|
||||
int size);
|
||||
|
||||
namespace {
|
||||
|
||||
@ -29,7 +29,7 @@ class Vp8PostProcessingFilterTest
|
||||
: public ::testing::TestWithParam<post_proc_func_t> {};
|
||||
|
||||
// Test routine for the VP8 post-processing function
|
||||
// vp8_post_proc_down_and_across_c.
|
||||
// vp8_post_proc_down_and_across_mb_row_c.
|
||||
|
||||
TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
|
||||
// Size of the underlying data block that will be filtered.
|
||||
@ -56,6 +56,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
|
||||
// Pointers to top-left pixel of block in the input and output images.
|
||||
uint8_t *const src_image_ptr = src_image + (input_stride << 1);
|
||||
uint8_t *const dst_image_ptr = dst_image + 8;
|
||||
uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
|
||||
(void)vpx_memset(flimits, 255, block_width);
|
||||
|
||||
// Initialize pixels in the input:
|
||||
// block pixels to value 1,
|
||||
@ -73,14 +75,13 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
|
||||
(void)vpx_memset(dst_image, 99, output_size);
|
||||
|
||||
GetParam()(src_image_ptr, dst_image_ptr, input_stride,
|
||||
output_stride, block_height, block_width,
|
||||
255);
|
||||
output_stride, block_width, flimits, 16);
|
||||
|
||||
static const uint8_t expected_data[block_height] = {
|
||||
3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3
|
||||
4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
|
||||
};
|
||||
|
||||
pixel_ptr = dst_image;
|
||||
pixel_ptr = dst_image_ptr;
|
||||
for (int i = 0; i < block_height; ++i) {
|
||||
for (int j = 0; j < block_width; ++j) {
|
||||
EXPECT_EQ(expected_data[i], pixel_ptr[j])
|
||||
@ -91,19 +92,15 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
|
||||
|
||||
vpx_free(src_image);
|
||||
vpx_free(dst_image);
|
||||
vpx_free(flimits);
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(C, Vp8PostProcessingFilterTest,
|
||||
::testing::Values(vp8_post_proc_down_and_across_c));
|
||||
|
||||
#if HAVE_MMX
|
||||
INSTANTIATE_TEST_CASE_P(MMX, Vp8PostProcessingFilterTest,
|
||||
::testing::Values(vp8_post_proc_down_and_across_mmx));
|
||||
#endif
|
||||
::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
|
||||
|
||||
#if HAVE_SSE2
|
||||
INSTANTIATE_TEST_CASE_P(SSE2, Vp8PostProcessingFilterTest,
|
||||
::testing::Values(vp8_post_proc_down_and_across_xmm));
|
||||
::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
@ -127,25 +127,24 @@ extern void vp8_blit_text(const char *msg, unsigned char *address, const int pit
|
||||
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
|
||||
/***********************************************************************************************************
|
||||
*/
|
||||
void vp8_post_proc_down_and_across_c
|
||||
void vp8_post_proc_down_and_across_mb_row_c
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
int src_pixels_per_line,
|
||||
int dst_pixels_per_line,
|
||||
int rows,
|
||||
int cols,
|
||||
int flimit
|
||||
unsigned char *f,
|
||||
int size
|
||||
)
|
||||
{
|
||||
unsigned char *p_src, *p_dst;
|
||||
int row;
|
||||
int col;
|
||||
int i;
|
||||
int v;
|
||||
unsigned char d[8];
|
||||
unsigned char v;
|
||||
unsigned char d[4];
|
||||
|
||||
for (row = 0; row < rows; row++)
|
||||
for (row = 0; row < size; row++)
|
||||
{
|
||||
/* post_proc_down for one row */
|
||||
p_src = src_ptr;
|
||||
@ -153,20 +152,23 @@ void vp8_post_proc_down_and_across_c
|
||||
|
||||
for (col = 0; col < cols; col++)
|
||||
{
|
||||
unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
|
||||
unsigned char p_above1 = p_src[col - src_pixels_per_line];
|
||||
unsigned char p_below1 = p_src[col + src_pixels_per_line];
|
||||
unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
|
||||
|
||||
int kernel = 4;
|
||||
int v = p_src[col];
|
||||
v = p_src[col];
|
||||
|
||||
for (i = -2; i <= 2; i++)
|
||||
if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
|
||||
&& (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
|
||||
{
|
||||
if (abs(v - p_src[col+i*src_pixels_per_line]) > flimit)
|
||||
goto down_skip_convolve;
|
||||
|
||||
kernel += kernel5[2+i] * p_src[col+i*src_pixels_per_line];
|
||||
unsigned char k1, k2, k3;
|
||||
k1 = (p_above2 + p_above1 + 1) >> 1;
|
||||
k2 = (p_below2 + p_below1 + 1) >> 1;
|
||||
k3 = (k1 + k2 + 1) >> 1;
|
||||
v = (k3 + v + 1) >> 1;
|
||||
}
|
||||
|
||||
v = (kernel >> 3);
|
||||
down_skip_convolve:
|
||||
p_dst[col] = v;
|
||||
}
|
||||
|
||||
@ -174,40 +176,34 @@ void vp8_post_proc_down_and_across_c
|
||||
p_src = dst_ptr;
|
||||
p_dst = dst_ptr;
|
||||
|
||||
for (i = -8; i<0; i++)
|
||||
p_src[i]=p_src[0];
|
||||
|
||||
for (i = cols; i<cols+8; i++)
|
||||
p_src[i]=p_src[cols-1];
|
||||
|
||||
for (i = 0; i < 8; i++)
|
||||
d[i] = p_src[i];
|
||||
p_src[-2] = p_src[-1] = p_src[0];
|
||||
p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
|
||||
|
||||
for (col = 0; col < cols; col++)
|
||||
{
|
||||
int kernel = 4;
|
||||
v = p_src[col];
|
||||
|
||||
d[col&7] = v;
|
||||
|
||||
for (i = -2; i <= 2; i++)
|
||||
if ((abs(v - p_src[col - 2]) < f[col])
|
||||
&& (abs(v - p_src[col - 1]) < f[col])
|
||||
&& (abs(v - p_src[col + 1]) < f[col])
|
||||
&& (abs(v - p_src[col + 2]) < f[col]))
|
||||
{
|
||||
if (abs(v - p_src[col+i]) > flimit)
|
||||
goto across_skip_convolve;
|
||||
|
||||
kernel += kernel5[2+i] * p_src[col+i];
|
||||
unsigned char k1, k2, k3;
|
||||
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
|
||||
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
|
||||
k3 = (k1 + k2 + 1) >> 1;
|
||||
v = (k3 + v + 1) >> 1;
|
||||
}
|
||||
|
||||
d[col&7] = (kernel >> 3);
|
||||
across_skip_convolve:
|
||||
d[col & 3] = v;
|
||||
|
||||
if (col >= 2)
|
||||
p_dst[col-2] = d[(col-2)&7];
|
||||
p_dst[col - 2] = d[(col - 2) & 3];
|
||||
}
|
||||
|
||||
/* handle the last two pixels */
|
||||
p_dst[col-2] = d[(col-2)&7];
|
||||
p_dst[col-1] = d[(col-1)&7];
|
||||
p_dst[col - 2] = d[(col - 2) & 3];
|
||||
p_dst[col - 1] = d[(col - 1) & 3];
|
||||
|
||||
/* next row */
|
||||
src_ptr += src_pixels_per_line;
|
||||
@ -318,28 +314,17 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
|
||||
YV12_BUFFER_CONFIG *post,
|
||||
int q,
|
||||
int low_var_thresh,
|
||||
int flag)
|
||||
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
|
||||
int q)
|
||||
{
|
||||
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
|
||||
int ppl = (int)(level + .5);
|
||||
(void) low_var_thresh;
|
||||
(void) flag;
|
||||
|
||||
vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
|
||||
vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
|
||||
vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
|
||||
|
||||
vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
|
||||
vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
|
||||
|
||||
vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
|
||||
post->y_width, q2mbl(q));
|
||||
vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
|
||||
post->y_width, q2mbl(q));
|
||||
}
|
||||
|
||||
void vp8_deblock(YV12_BUFFER_CONFIG *source,
|
||||
void vp8_deblock(VP8_COMMON *cm,
|
||||
YV12_BUFFER_CONFIG *source,
|
||||
YV12_BUFFER_CONFIG *post,
|
||||
int q,
|
||||
int low_var_thresh,
|
||||
@ -347,12 +332,58 @@ void vp8_deblock(YV12_BUFFER_CONFIG *source,
|
||||
{
|
||||
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
|
||||
int ppl = (int)(level + .5);
|
||||
|
||||
const MODE_INFO *mode_info_context = cm->mi;
|
||||
int mbr, mbc;
|
||||
|
||||
/* The pixel thresholds are adjusted according to whether or not the
|
||||
* macroblock is a skipped block (skipped blocks get half the threshold). */
|
||||
unsigned char *ylimits = (unsigned char *)vpx_memalign(16, 16 * cm->mb_cols);
|
||||
unsigned char *uvlimits = (unsigned char *)vpx_memalign(16, 8 * cm->mb_cols);
|
||||
(void) low_var_thresh;
|
||||
(void) flag;
|
||||
|
||||
vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
|
||||
vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
|
||||
vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
|
||||
if (ppl > 0)
|
||||
{
|
||||
for (mbr = 0; mbr < cm->mb_rows; mbr++)
|
||||
{
|
||||
unsigned char *ylptr = ylimits;
|
||||
unsigned char *uvlptr = uvlimits;
|
||||
for (mbc = 0; mbc < cm->mb_cols; mbc++)
|
||||
{
|
||||
unsigned char mb_ppl;
|
||||
|
||||
if (mode_info_context->mbmi.mb_skip_coeff)
|
||||
mb_ppl = (unsigned char)ppl >> 1;
|
||||
else
|
||||
mb_ppl = (unsigned char)ppl;
|
||||
|
||||
vpx_memset(ylptr, mb_ppl, 16);
|
||||
vpx_memset(uvlptr, mb_ppl, 8);
|
||||
|
||||
ylptr += 16;
|
||||
uvlptr += 8;
|
||||
mode_info_context++;
|
||||
}
|
||||
mode_info_context++;
|
||||
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->y_buffer + 16 * mbr * source->y_stride,
|
||||
post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
|
||||
post->y_stride, source->y_width, ylimits, 16);
|
||||
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->u_buffer + 8 * mbr * source->uv_stride,
|
||||
post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
|
||||
post->uv_stride, source->uv_width, uvlimits, 8);
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->v_buffer + 8 * mbr * source->uv_stride,
|
||||
post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
|
||||
post->uv_stride, source->uv_width, uvlimits, 8);
|
||||
}
|
||||
}
|
||||
vpx_free(ylimits);
|
||||
vpx_free(uvlimits);
|
||||
}
|
||||
|
||||
#if !(CONFIG_TEMPORAL_DENOISING)
|
||||
@ -364,33 +395,35 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
|
||||
{
|
||||
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
|
||||
int ppl = (int)(level + .5);
|
||||
int mb_rows = source->y_height >> 4; /* number of 16-row strips: derived from the frame height */
|
||||
int mb_cols = source->y_width >> 4; /* limits[] is indexed per column: derived from the frame width */
|
||||
unsigned char *limits = (unsigned char *)vpx_memalign(16, 16 * mb_cols);
|
||||
int mbr, mbc;
|
||||
(void) post;
|
||||
(void) low_var_thresh;
|
||||
(void) flag;
|
||||
|
||||
vp8_post_proc_down_and_across(
|
||||
source->y_buffer + 2 * source->y_stride + 2,
|
||||
source->y_buffer + 2 * source->y_stride + 2,
|
||||
source->y_stride,
|
||||
source->y_stride,
|
||||
source->y_height - 4,
|
||||
source->y_width - 4,
|
||||
ppl);
|
||||
vp8_post_proc_down_and_across(
|
||||
source->u_buffer + 2 * source->uv_stride + 2,
|
||||
source->u_buffer + 2 * source->uv_stride + 2,
|
||||
source->uv_stride,
|
||||
source->uv_stride,
|
||||
source->uv_height - 4,
|
||||
source->uv_width - 4, ppl);
|
||||
vp8_post_proc_down_and_across(
|
||||
source->v_buffer + 2 * source->uv_stride + 2,
|
||||
source->v_buffer + 2 * source->uv_stride + 2,
|
||||
source->uv_stride,
|
||||
source->uv_stride,
|
||||
source->uv_height - 4,
|
||||
source->uv_width - 4, ppl);
|
||||
/* TODO: The original code doesn't filter the 2 outer rows and columns. */
|
||||
vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
|
||||
|
||||
for (mbr = 0; mbr < mb_rows; mbr++)
|
||||
{
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->y_buffer + 16 * mbr * source->y_stride,
|
||||
source->y_buffer + 16 * mbr * source->y_stride,
|
||||
source->y_stride, source->y_stride, source->y_width, limits, 16);
|
||||
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->u_buffer + 8 * mbr * source->uv_stride,
|
||||
source->u_buffer + 8 * mbr * source->uv_stride,
|
||||
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->v_buffer + 8 * mbr * source->uv_stride,
|
||||
source->v_buffer + 8 * mbr * source->uv_stride,
|
||||
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
|
||||
}
|
||||
|
||||
vpx_free(limits);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -752,12 +785,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
|
||||
vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
|
||||
if (flags & VP8D_DEMACROBLOCK)
|
||||
{
|
||||
vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
|
||||
vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
|
||||
q + (deblock_level - 5) * 10, 1, 0);
|
||||
vp8_de_mblock(&oci->post_proc_buffer,
|
||||
q + (deblock_level - 5) * 10);
|
||||
}
|
||||
else if (flags & VP8D_DEBLOCK)
|
||||
{
|
||||
vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
|
||||
vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
|
||||
q, 1, 0);
|
||||
}
|
||||
}
|
||||
@ -766,13 +801,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
|
||||
}
|
||||
else if (flags & VP8D_DEMACROBLOCK)
|
||||
{
|
||||
vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
|
||||
q + (deblock_level - 5) * 10, 1, 0);
|
||||
vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
|
||||
q + (deblock_level - 5) * 10, 1, 0);
|
||||
vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
|
||||
|
||||
oci->postproc_state.last_base_qindex = oci->base_qindex;
|
||||
}
|
||||
else if (flags & VP8D_DEBLOCK)
|
||||
{
|
||||
vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
|
||||
vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
|
||||
q, 1, 0);
|
||||
oci->postproc_state.last_base_qindex = oci->base_qindex;
|
||||
}
|
||||
|
@ -36,7 +36,8 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
|
||||
int low_var_thresh,
|
||||
int flag);
|
||||
|
||||
void vp8_deblock(YV12_BUFFER_CONFIG *source,
|
||||
void vp8_deblock(struct VP8Common *oci,
|
||||
YV12_BUFFER_CONFIG *source,
|
||||
YV12_BUFFER_CONFIG *post,
|
||||
int q,
|
||||
int low_var_thresh,
|
||||
|
@ -19,14 +19,14 @@ void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
|
||||
void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
|
||||
void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
|
||||
|
||||
extern void (*vp8_post_proc_down_and_across)(
|
||||
extern void (*vp8_post_proc_down_and_across_mb_row)(
|
||||
unsigned char *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
int src_pixels_per_line,
|
||||
int dst_pixels_per_line,
|
||||
int rows,
|
||||
int cols,
|
||||
int flimit
|
||||
unsigned char *f,
|
||||
int size
|
||||
);
|
||||
|
||||
extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
|
||||
@ -34,15 +34,15 @@ extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int
|
||||
extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
|
||||
extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
|
||||
|
||||
extern void vp8_post_proc_down_and_across_c
|
||||
extern void vp8_post_proc_down_and_across_mb_row_c
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
int src_pixels_per_line,
|
||||
int dst_pixels_per_line,
|
||||
int rows,
|
||||
int cols,
|
||||
int flimit
|
||||
unsigned char *f,
|
||||
int size
|
||||
);
|
||||
void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
|
||||
|
||||
@ -158,7 +158,7 @@ void vp8_machine_specific_config(void)
|
||||
vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
|
||||
vp8_lf_bhsimple = loop_filter_bhs_ppc;
|
||||
|
||||
vp8_post_proc_down_and_across = vp8_post_proc_down_and_across_c;
|
||||
vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
|
||||
vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
|
||||
vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
|
||||
vp8_plane_add_noise = vp8_plane_add_noise_c;
|
||||
|
@ -162,9 +162,8 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then
|
||||
specialize vp8_mbpost_proc_across_ip sse2
|
||||
vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
|
||||
|
||||
prototype void vp8_post_proc_down_and_across "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int rows, int cols, int flimit"
|
||||
specialize vp8_post_proc_down_and_across mmx sse2
|
||||
vp8_post_proc_down_and_across_sse2=vp8_post_proc_down_and_across_xmm
|
||||
prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
|
||||
specialize vp8_post_proc_down_and_across_mb_row sse2
|
||||
|
||||
prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
|
||||
specialize vp8_plane_add_noise mmx sse2
|
||||
|
@ -14,271 +14,6 @@
|
||||
%define VP8_FILTER_WEIGHT 128
|
||||
%define VP8_FILTER_SHIFT 7
|
||||
|
||||
;void vp8_post_proc_down_and_across_mmx
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned char *dst_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; int dst_pixels_per_line,
|
||||
; int rows,
|
||||
; int cols,
|
||||
; int flimit
|
||||
;)
|
||||
global sym(vp8_post_proc_down_and_across_mmx) PRIVATE
|
||||
sym(vp8_post_proc_down_and_across_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
|
||||
; move the global rd onto the stack, since we don't have enough registers
|
||||
; to do PIC addressing
|
||||
movq mm0, [GLOBAL(rd)]
|
||||
sub rsp, 8
|
||||
movq [rsp], mm0
|
||||
%define RD [rsp]
|
||||
%else
|
||||
%define RD [GLOBAL(rd)]
|
||||
%endif
|
||||
|
||||
push rbx
|
||||
lea rbx, [GLOBAL(Blur)]
|
||||
movd mm2, dword ptr arg(6) ;flimit
|
||||
punpcklwd mm2, mm2
|
||||
punpckldq mm2, mm2
|
||||
|
||||
mov rsi, arg(0) ;src_ptr
|
||||
mov rdi, arg(1) ;dst_ptr
|
||||
|
||||
movsxd rcx, DWORD PTR arg(4) ;rows
|
||||
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
pxor mm0, mm0 ; mm0 = 00000000
|
||||
|
||||
.nextrow:
|
||||
|
||||
xor rdx, rdx ; clear out rdx for use as loop counter
|
||||
.nextcol:
|
||||
|
||||
pxor mm7, mm7 ; mm7 = 00000000
|
||||
movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
|
||||
movq mm3, [rsi] ; mm4 = r0 p0..p7
|
||||
punpcklbw mm3, mm0 ; mm3 = p0..p3
|
||||
movq mm1, mm3 ; mm1 = p0..p3
|
||||
pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
|
||||
|
||||
movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
|
||||
movq mm5, [rsi + rax] ; mm4 = r1 p0..p7
|
||||
punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
|
||||
pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
|
||||
paddusw mm3, mm6 ; mm3 += mm6
|
||||
|
||||
; thresholding
|
||||
movq mm7, mm1 ; mm7 = r0 p0..p3
|
||||
psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
|
||||
psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
|
||||
paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
|
||||
pcmpgtw mm7, mm2
|
||||
|
||||
movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
|
||||
movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7
|
||||
punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
|
||||
pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
|
||||
paddusw mm3, mm6 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movq mm6, mm1 ; mm6 = r0 p0..p3
|
||||
psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
|
||||
psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3
|
||||
paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
|
||||
pcmpgtw mm6, mm2
|
||||
por mm7, mm6 ; accumulate thresholds
|
||||
|
||||
|
||||
neg rax
|
||||
movq mm6, [rbx ] ; kernel 0 taps
|
||||
movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7
|
||||
punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
|
||||
pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
|
||||
paddusw mm3, mm6 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movq mm6, mm1 ; mm6 = r0 p0..p3
|
||||
psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
|
||||
psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
|
||||
paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
|
||||
pcmpgtw mm6, mm2
|
||||
por mm7, mm6 ; accumulate thresholds
|
||||
|
||||
movq mm6, [rbx + 16] ; kernel 1 taps
|
||||
movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
|
||||
punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
|
||||
pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
|
||||
paddusw mm3, mm6 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movq mm6, mm1 ; mm6 = r0 p0..p3
|
||||
psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
|
||||
psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
|
||||
paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
|
||||
pcmpgtw mm6, mm2
|
||||
por mm7, mm6 ; accumulate thresholds
|
||||
|
||||
|
||||
paddusw mm3, RD ; mm3 += round value
|
||||
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
|
||||
|
||||
pand mm1, mm7 ; mm1 select vals > thresh from source
|
||||
pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
|
||||
paddusw mm1, mm7 ; combination
|
||||
|
||||
packuswb mm1, mm0 ; pack to bytes
|
||||
|
||||
movd [rdi], mm1 ;
|
||||
neg rax ; pitch is positive
|
||||
|
||||
|
||||
add rsi, 4
|
||||
add rdi, 4
|
||||
add rdx, 4
|
||||
|
||||
cmp edx, dword ptr arg(5) ;cols
|
||||
jl .nextcol
|
||||
; done with the all cols, start the across filtering in place
|
||||
sub rsi, rdx
|
||||
sub rdi, rdx
|
||||
|
||||
; dup the first byte into the left border 8 times
|
||||
movq mm1, [rdi]
|
||||
punpcklbw mm1, mm1
|
||||
punpcklwd mm1, mm1
|
||||
punpckldq mm1, mm1
|
||||
|
||||
mov rdx, -8
|
||||
movq [rdi+rdx], mm1
|
||||
|
||||
; dup the last byte into the right border
|
||||
movsxd rdx, dword arg(5)
|
||||
movq mm1, [rdi + rdx + -1]
|
||||
punpcklbw mm1, mm1
|
||||
punpcklwd mm1, mm1
|
||||
punpckldq mm1, mm1
|
||||
movq [rdi+rdx], mm1
|
||||
|
||||
|
||||
push rax
|
||||
xor rdx, rdx
|
||||
mov rax, [rdi-4];
|
||||
|
||||
.acrossnextcol:
|
||||
pxor mm7, mm7 ; mm7 = 00000000
|
||||
movq mm6, [rbx + 32 ] ;
|
||||
movq mm4, [rdi+rdx] ; mm4 = p0..p7
|
||||
movq mm3, mm4 ; mm3 = p0..p7
|
||||
punpcklbw mm3, mm0 ; mm3 = p0..p3
|
||||
movq mm1, mm3 ; mm1 = p0..p3
|
||||
pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
|
||||
|
||||
movq mm6, [rbx + 48]
|
||||
psrlq mm4, 8 ; mm4 = p1..p7
|
||||
movq mm5, mm4 ; mm5 = p1..p7
|
||||
punpcklbw mm5, mm0 ; mm5 = p1..p4
|
||||
pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
|
||||
paddusw mm3, mm6 ; mm3 += mm6
|
||||
|
||||
; thresholding
|
||||
movq mm7, mm1 ; mm7 = p0..p3
|
||||
psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
|
||||
psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw mm7, mm2
|
||||
|
||||
movq mm6, [rbx + 64 ]
|
||||
psrlq mm4, 8 ; mm4 = p2..p7
|
||||
movq mm5, mm4 ; mm5 = p2..p7
|
||||
punpcklbw mm5, mm0 ; mm5 = p2..p5
|
||||
pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
|
||||
paddusw mm3, mm6 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movq mm6, mm1 ; mm6 = p0..p3
|
||||
psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
|
||||
psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw mm6, mm2
|
||||
por mm7, mm6 ; accumulate thresholds
|
||||
|
||||
|
||||
movq mm6, [rbx ]
|
||||
movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
|
||||
movq mm5, mm4 ; mm5 = p-2..p5
|
||||
punpcklbw mm5, mm0 ; mm5 = p-2..p1
|
||||
pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
|
||||
paddusw mm3, mm6 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movq mm6, mm1 ; mm6 = p0..p3
|
||||
psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
|
||||
psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw mm6, mm2
|
||||
por mm7, mm6 ; accumulate thresholds
|
||||
|
||||
movq mm6, [rbx + 16]
|
||||
psrlq mm4, 8 ; mm4 = p-1..p5
|
||||
punpcklbw mm4, mm0 ; mm4 = p-1..p2
|
||||
pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
|
||||
paddusw mm3, mm6 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movq mm6, mm1 ; mm6 = p0..p3
|
||||
psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4
|
||||
psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw mm6, mm2
|
||||
por mm7, mm6 ; accumulate thresholds
|
||||
|
||||
paddusw mm3, RD ; mm3 += round value
|
||||
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
|
||||
|
||||
pand mm1, mm7 ; mm1 select vals > thresh from source
|
||||
pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
|
||||
paddusw mm1, mm7 ; combination
|
||||
|
||||
packuswb mm1, mm0 ; pack to bytes
|
||||
mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
|
||||
movd eax, mm1
|
||||
|
||||
add rdx, 4
|
||||
cmp edx, dword ptr arg(5) ;cols
|
||||
jl .acrossnextcol;
|
||||
|
||||
mov DWORD PTR [rdi+rdx-4], eax
|
||||
pop rax
|
||||
|
||||
; done with this row
|
||||
add rsi,rax ; next line
|
||||
movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
|
||||
add rdi,rax ; next destination
|
||||
movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
|
||||
dec rcx ; decrement count
|
||||
jnz .nextrow ; next row
|
||||
pop rbx
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
%undef RD
|
||||
|
||||
|
||||
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
|
||||
; int pitch, int rows, int cols,int flimit)
|
||||
extern sym(vp8_rv)
|
||||
|
@ -11,146 +11,158 @@
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp8_post_proc_down_and_across_xmm
|
||||
;macro in deblock functions
|
||||
%macro FIRST_2_ROWS 0
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm6, xmm0
|
||||
movdqa xmm5, xmm1
|
||||
pavgb xmm5, xmm3
|
||||
|
||||
;calculate absolute value
|
||||
psubusb xmm4, xmm1
|
||||
psubusb xmm1, xmm0
|
||||
psubusb xmm6, xmm3
|
||||
psubusb xmm3, xmm0
|
||||
paddusb xmm4, xmm1
|
||||
paddusb xmm6, xmm3
|
||||
|
||||
;get threshold
|
||||
movdqa xmm2, flimit
|
||||
pxor xmm1, xmm1
|
||||
movdqa xmm7, xmm2
|
||||
|
||||
;get mask
|
||||
psubusb xmm2, xmm4
|
||||
psubusb xmm7, xmm6
|
||||
pcmpeqb xmm2, xmm1
|
||||
pcmpeqb xmm7, xmm1
|
||||
por xmm7, xmm2
|
||||
%endmacro
|
||||
|
||||
%macro SECOND_2_ROWS 0
|
||||
movdqa xmm6, xmm0
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm2, xmm1
|
||||
pavgb xmm1, xmm3
|
||||
|
||||
;calculate absolute value
|
||||
psubusb xmm6, xmm2
|
||||
psubusb xmm2, xmm0
|
||||
psubusb xmm4, xmm3
|
||||
psubusb xmm3, xmm0
|
||||
paddusb xmm6, xmm2
|
||||
paddusb xmm4, xmm3
|
||||
|
||||
pavgb xmm5, xmm1
|
||||
|
||||
;get threshold
|
||||
movdqa xmm2, flimit
|
||||
pxor xmm1, xmm1
|
||||
movdqa xmm3, xmm2
|
||||
|
||||
;get mask
|
||||
psubusb xmm2, xmm6
|
||||
psubusb xmm3, xmm4
|
||||
pcmpeqb xmm2, xmm1
|
||||
pcmpeqb xmm3, xmm1
|
||||
|
||||
por xmm7, xmm2
|
||||
por xmm7, xmm3
|
||||
|
||||
pavgb xmm5, xmm0
|
||||
|
||||
;decide whether or not to use the filtered value
|
||||
pand xmm0, xmm7
|
||||
pandn xmm7, xmm5
|
||||
paddusb xmm0, xmm7
|
||||
%endmacro
|
||||
|
||||
%macro UPDATE_FLIMIT 0
|
||||
movdqa xmm2, XMMWORD PTR [rbx]
|
||||
movdqa [rsp], xmm2
|
||||
add rbx, 16
|
||||
%endmacro
|
||||
|
||||
;void vp8_post_proc_down_and_across_mb_row_sse2
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned char *dst_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; int dst_pixels_per_line,
|
||||
; int rows,
|
||||
; int cols,
|
||||
; int flimit
|
||||
; unsigned char *flimits,
|
||||
; int size
|
||||
;)
|
||||
global sym(vp8_post_proc_down_and_across_xmm) PRIVATE
|
||||
sym(vp8_post_proc_down_and_across_xmm):
|
||||
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
|
||||
sym(vp8_post_proc_down_and_across_mb_row_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
|
||||
ALIGN_STACK 16, rax
|
||||
; move the global rd onto the stack, since we don't have enough registers
|
||||
; to do PIC addressing
|
||||
movdqa xmm0, [GLOBAL(rd42)]
|
||||
sub rsp, 16
|
||||
movdqa [rsp], xmm0
|
||||
%define RD42 [rsp]
|
||||
%else
|
||||
%define RD42 [GLOBAL(rd42)]
|
||||
%endif
|
||||
|
||||
; put flimit on stack
|
||||
mov rbx, arg(5) ;flimits ptr
|
||||
UPDATE_FLIMIT
|
||||
|
||||
movd xmm2, dword ptr arg(6) ;flimit
|
||||
punpcklwd xmm2, xmm2
|
||||
punpckldq xmm2, xmm2
|
||||
punpcklqdq xmm2, xmm2
|
||||
%define flimit [rsp]
|
||||
|
||||
mov rsi, arg(0) ;src_ptr
|
||||
mov rdi, arg(1) ;dst_ptr
|
||||
|
||||
movsxd rcx, DWORD PTR arg(4) ;rows
|
||||
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
pxor xmm0, xmm0 ; mm0 = 00000000
|
||||
mov rsi, arg(0) ;src_ptr
|
||||
mov rdi, arg(1) ;dst_ptr
|
||||
|
||||
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
|
||||
movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
|
||||
.nextrow:
|
||||
|
||||
xor rdx, rdx ; clear out rdx for use as loop counter
|
||||
xor rdx, rdx ;col
|
||||
.nextcol:
|
||||
movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
|
||||
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
|
||||
movdqa xmm1, xmm3 ; mm1 = p0..p3
|
||||
psllw xmm3, 2 ;
|
||||
|
||||
movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
|
||||
punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
|
||||
paddusw xmm3, xmm5 ; mm3 += mm6
|
||||
|
||||
; thresholding
|
||||
movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
|
||||
psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
|
||||
psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
|
||||
paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
|
||||
pcmpgtw xmm7, xmm2
|
||||
|
||||
movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
|
||||
punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
|
||||
paddusw xmm3, xmm5 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
|
||||
psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
|
||||
psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3
|
||||
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
|
||||
pcmpgtw xmm6, xmm2
|
||||
por xmm7, xmm6 ; accumulate thresholds
|
||||
;load current and next 2 rows
|
||||
movdqu xmm0, XMMWORD PTR [rsi]
|
||||
movdqu xmm1, XMMWORD PTR [rsi + rax]
|
||||
movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
|
||||
|
||||
FIRST_2_ROWS
|
||||
|
||||
;load above 2 rows
|
||||
neg rax
|
||||
movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
|
||||
punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
|
||||
paddusw xmm3, xmm5 ; mm3 += mm5
|
||||
movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
|
||||
movdqu xmm3, XMMWORD PTR [rsi + rax]
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
|
||||
psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
|
||||
psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
|
||||
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
|
||||
pcmpgtw xmm6, xmm2
|
||||
por xmm7, xmm6 ; accumulate thresholds
|
||||
SECOND_2_ROWS
|
||||
|
||||
movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
|
||||
punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
|
||||
paddusw xmm3, xmm4 ; mm3 += mm5
|
||||
movdqu XMMWORD PTR [rdi], xmm0
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
|
||||
psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3
|
||||
psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3
|
||||
paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
|
||||
pcmpgtw xmm6, xmm2
|
||||
por xmm7, xmm6 ; accumulate thresholds
|
||||
neg rax ; positive stride
|
||||
add rsi, 16
|
||||
add rdi, 16
|
||||
|
||||
UPDATE_FLIMIT
|
||||
|
||||
paddusw xmm3, RD42 ; mm3 += round value
|
||||
psraw xmm3, 3 ; mm3 /= 8
|
||||
|
||||
pand xmm1, xmm7 ; mm1 select vals > thresh from source
|
||||
pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
|
||||
paddusw xmm1, xmm7 ; combination
|
||||
|
||||
packuswb xmm1, xmm0 ; pack to bytes
|
||||
movq QWORD PTR [rdi], xmm1 ;
|
||||
|
||||
neg rax ; pitch is positive
|
||||
add rsi, 8
|
||||
add rdi, 8
|
||||
|
||||
add rdx, 8
|
||||
cmp edx, dword arg(5) ;cols
|
||||
|
||||
add rdx, 16
|
||||
cmp edx, dword arg(4) ;cols
|
||||
jl .nextcol
|
||||
|
||||
; done with the all cols, start the across filtering in place
|
||||
sub rsi, rdx
|
||||
sub rdi, rdx
|
||||
|
||||
mov rbx, arg(5) ; flimits
|
||||
UPDATE_FLIMIT
|
||||
|
||||
; dup the first byte into the left border 8 times
|
||||
movq mm1, [rdi]
|
||||
punpcklbw mm1, mm1
|
||||
punpcklwd mm1, mm1
|
||||
punpckldq mm1, mm1
|
||||
|
||||
mov rdx, -8
|
||||
movq [rdi+rdx], mm1
|
||||
|
||||
; dup the last byte into the right border
|
||||
movsxd rdx, dword arg(5)
|
||||
movsxd rdx, dword arg(4)
|
||||
movq mm1, [rdi + rdx + -1]
|
||||
punpcklbw mm1, mm1
|
||||
punpcklwd mm1, mm1
|
||||
@ -158,113 +170,63 @@ sym(vp8_post_proc_down_and_across_xmm):
|
||||
movq [rdi+rdx], mm1
|
||||
|
||||
xor rdx, rdx
|
||||
movq mm0, QWORD PTR [rdi-8];
|
||||
movq mm0, QWORD PTR [rdi-16];
|
||||
movq mm1, QWORD PTR [rdi-8];
|
||||
|
||||
.acrossnextcol:
|
||||
movq xmm7, QWORD PTR [rdi +rdx -2]
|
||||
movd xmm4, DWORD PTR [rdi +rdx +6]
|
||||
movdqu xmm0, XMMWORD PTR [rdi + rdx]
|
||||
movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
|
||||
movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
|
||||
|
||||
pslldq xmm4, 8
|
||||
por xmm4, xmm7
|
||||
FIRST_2_ROWS
|
||||
|
||||
movdqa xmm3, xmm4
|
||||
psrldq xmm3, 2
|
||||
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
|
||||
movdqa xmm1, xmm3 ; mm1 = p0..p3
|
||||
psllw xmm3, 2
|
||||
movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
|
||||
movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
|
||||
|
||||
SECOND_2_ROWS
|
||||
|
||||
movdqa xmm5, xmm4
|
||||
psrldq xmm5, 3
|
||||
punpcklbw xmm5, xmm0 ; mm5 = p1..p4
|
||||
paddusw xmm3, xmm5 ; mm3 += mm6
|
||||
movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
|
||||
movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
|
||||
movdq2q mm0, xmm0
|
||||
psrldq xmm0, 8
|
||||
movdq2q mm1, xmm0
|
||||
|
||||
; thresholding
|
||||
movdqa xmm7, xmm1 ; mm7 = p0..p3
|
||||
psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
|
||||
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw xmm7, xmm2
|
||||
UPDATE_FLIMIT
|
||||
|
||||
movdqa xmm5, xmm4
|
||||
psrldq xmm5, 4
|
||||
punpcklbw xmm5, xmm0 ; mm5 = p2..p5
|
||||
paddusw xmm3, xmm5 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = p0..p3
|
||||
psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
|
||||
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw xmm6, xmm2
|
||||
por xmm7, xmm6 ; accumulate thresholds
|
||||
|
||||
|
||||
movdqa xmm5, xmm4 ; mm5 = p-2..p5
|
||||
punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
|
||||
paddusw xmm3, xmm5 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = p0..p3
|
||||
psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
|
||||
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw xmm6, xmm2
|
||||
por xmm7, xmm6 ; accumulate thresholds
|
||||
|
||||
psrldq xmm4, 1 ; mm4 = p-1..p5
|
||||
punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
|
||||
paddusw xmm3, xmm4 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = p0..p3
|
||||
psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
|
||||
psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3
|
||||
paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
|
||||
pcmpgtw xmm6, xmm2
|
||||
por xmm7, xmm6 ; accumulate thresholds
|
||||
|
||||
paddusw xmm3, RD42 ; mm3 += round value
|
||||
psraw xmm3, 3 ; mm3 /= 8
|
||||
|
||||
pand xmm1, xmm7 ; mm1 select vals > thresh from source
|
||||
pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
|
||||
paddusw xmm1, xmm7 ; combination
|
||||
|
||||
packuswb xmm1, xmm0 ; pack to bytes
|
||||
movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes
|
||||
movdq2q mm0, xmm1
|
||||
|
||||
add rdx, 8
|
||||
cmp edx, dword arg(5) ;cols
|
||||
add rdx, 16
|
||||
cmp edx, dword arg(4) ;cols
|
||||
jl .acrossnextcol;
|
||||
|
||||
; last 8 pixels
|
||||
movq QWORD PTR [rdi+rdx-8], mm0
|
||||
; last 16 pixels
|
||||
movq QWORD PTR [rdi+rdx-16], mm0
|
||||
|
||||
cmp edx, dword arg(4)
|
||||
jne .throw_last_8
|
||||
movq QWORD PTR [rdi+rdx-8], mm1
|
||||
.throw_last_8:
|
||||
; done with this row
|
||||
add rsi,rax ; next line
|
||||
mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
|
||||
add rdi,rax ; next destination
|
||||
mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
add rsi,rax ;next src line
|
||||
mov eax, dword arg(3) ;dst_pixels_per_line
|
||||
add rdi,rax ;next destination
|
||||
mov eax, dword arg(2) ;src_pixels_per_line
|
||||
|
||||
dec rcx ; decrement count
|
||||
jnz .nextrow ; next row
|
||||
mov rbx, arg(5) ;flimits
|
||||
UPDATE_FLIMIT
|
||||
|
||||
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
|
||||
add rsp,16
|
||||
dec rcx ;decrement count
|
||||
jnz .nextrow ;next row
|
||||
|
||||
add rsp, 16
|
||||
pop rsp
|
||||
%endif
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
pop rbx
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
%undef RD42
|
||||
|
||||
%undef flimit
|
||||
|
||||
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
|
||||
; int pitch, int rows, int cols,int flimit)
|
||||
@ -753,7 +715,5 @@ sym(vp8_plane_add_noise_wmt):
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
rd42:
|
||||
times 8 dw 0x04
|
||||
four8s:
|
||||
times 4 dd 8
|
||||
|
@ -5301,7 +5301,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
|
||||
double frame_psnr2, frame_ssim2 = 0;
|
||||
double weight = 0;
|
||||
|
||||
vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
|
||||
vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
|
||||
vp8_clear_system_state();
|
||||
|
||||
ye = calc_plane_error(orig->y_buffer, orig->y_stride,
|
||||
|
Loading…
Reference in New Issue
Block a user