post-proc: deblock filter optimization

1. Algorithm modification:
Instead of using the same filter threshold for the whole frame, we now
allow the thresholds to be adjusted for each macroblock. In the current
implementation, to avoid excessive blur on the background as reported
in issue480(http://code.google.com/p/webm/issues/detail?id=480), we
reduce the thresholds for skipped macroblocks.

2. SSE2 optimization:
As started in issue479(http://code.google.com/p/webm/issues/detail?id=479),
the filter calculation was adjusted for better performance. The C
code was also modified accordingly. This made the deblock filter
2x faster, and the decoder was 1.2x faster overall.

Next, the demacroblock filter will be modified similarly.

Change-Id: I05e54c3f580ccd427487d085096b3174f2ab7e86
This commit is contained in:
Yunqing Wang 2012-09-28 10:13:07 -07:00
parent 9704cdec9f
commit 4c53bacce4
8 changed files with 287 additions and 558 deletions

View File

@ -19,9 +19,9 @@ typedef void (*post_proc_func_t)(unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit);
unsigned char *flimit,
int size);
namespace {
@ -29,7 +29,7 @@ class Vp8PostProcessingFilterTest
: public ::testing::TestWithParam<post_proc_func_t> {};
// Test routine for the VP8 post-processing function
// vp8_post_proc_down_and_across_c.
// vp8_post_proc_down_and_across_mb_row_c.
TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
// Size of the underlying data block that will be filtered.
@ -56,6 +56,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
// Pointers to top-left pixel of block in the input and output images.
uint8_t *const src_image_ptr = src_image + (input_stride << 1);
uint8_t *const dst_image_ptr = dst_image + 8;
uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
(void)vpx_memset(flimits, 255, block_width);
// Initialize pixels in the input:
// block pixels to value 1,
@ -73,14 +75,13 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
(void)vpx_memset(dst_image, 99, output_size);
GetParam()(src_image_ptr, dst_image_ptr, input_stride,
output_stride, block_height, block_width,
255);
output_stride, block_width, flimits, 16);
static const uint8_t expected_data[block_height] = {
3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3
4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
};
pixel_ptr = dst_image;
pixel_ptr = dst_image_ptr;
for (int i = 0; i < block_height; ++i) {
for (int j = 0; j < block_width; ++j) {
EXPECT_EQ(expected_data[i], pixel_ptr[j])
@ -91,19 +92,15 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
vpx_free(src_image);
vpx_free(dst_image);
vpx_free(flimits);
};
INSTANTIATE_TEST_CASE_P(C, Vp8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_c));
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(MMX, Vp8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_mmx));
#endif
::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, Vp8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_xmm));
::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
#endif
} // namespace

View File

@ -127,25 +127,24 @@ extern void vp8_blit_text(const char *msg, unsigned char *address, const int pit
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
*/
void vp8_post_proc_down_and_across_c
void vp8_post_proc_down_and_across_mb_row_c
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
unsigned char *f,
int size
)
{
unsigned char *p_src, *p_dst;
int row;
int col;
int i;
int v;
unsigned char d[8];
unsigned char v;
unsigned char d[4];
for (row = 0; row < rows; row++)
for (row = 0; row < size; row++)
{
/* post_proc_down for one row */
p_src = src_ptr;
@ -153,20 +152,23 @@ void vp8_post_proc_down_and_across_c
for (col = 0; col < cols; col++)
{
unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
unsigned char p_above1 = p_src[col - src_pixels_per_line];
unsigned char p_below1 = p_src[col + src_pixels_per_line];
unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
int kernel = 4;
int v = p_src[col];
v = p_src[col];
for (i = -2; i <= 2; i++)
if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
&& (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
{
if (abs(v - p_src[col+i*src_pixels_per_line]) > flimit)
goto down_skip_convolve;
kernel += kernel5[2+i] * p_src[col+i*src_pixels_per_line];
unsigned char k1, k2, k3;
k1 = (p_above2 + p_above1 + 1) >> 1;
k2 = (p_below2 + p_below1 + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
v = (kernel >> 3);
down_skip_convolve:
p_dst[col] = v;
}
@ -174,40 +176,34 @@ void vp8_post_proc_down_and_across_c
p_src = dst_ptr;
p_dst = dst_ptr;
for (i = -8; i<0; i++)
p_src[i]=p_src[0];
for (i = cols; i<cols+8; i++)
p_src[i]=p_src[cols-1];
for (i = 0; i < 8; i++)
d[i] = p_src[i];
p_src[-2] = p_src[-1] = p_src[0];
p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
for (col = 0; col < cols; col++)
{
int kernel = 4;
v = p_src[col];
d[col&7] = v;
for (i = -2; i <= 2; i++)
if ((abs(v - p_src[col - 2]) < f[col])
&& (abs(v - p_src[col - 1]) < f[col])
&& (abs(v - p_src[col + 1]) < f[col])
&& (abs(v - p_src[col + 2]) < f[col]))
{
if (abs(v - p_src[col+i]) > flimit)
goto across_skip_convolve;
kernel += kernel5[2+i] * p_src[col+i];
unsigned char k1, k2, k3;
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
d[col&7] = (kernel >> 3);
across_skip_convolve:
d[col & 3] = v;
if (col >= 2)
p_dst[col-2] = d[(col-2)&7];
p_dst[col - 2] = d[(col - 2) & 3];
}
/* handle the last two pixels */
p_dst[col-2] = d[(col-2)&7];
p_dst[col-1] = d[(col-1)&7];
p_dst[col - 2] = d[(col - 2) & 3];
p_dst[col - 1] = d[(col - 1) & 3];
/* next row */
src_ptr += src_pixels_per_line;
@ -318,28 +314,17 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
}
}
static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
int flag)
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
int q)
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
(void) low_var_thresh;
(void) flag;
vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
}
void vp8_deblock(YV12_BUFFER_CONFIG *source,
void vp8_deblock(VP8_COMMON *cm,
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
@ -347,12 +332,58 @@ void vp8_deblock(YV12_BUFFER_CONFIG *source,
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
const MODE_INFO *mode_info_context = cm->mi;
int mbr, mbc;
/* The pixel thresholds are adjusted according to if or not the macroblock
* is a skipped block. */
unsigned char *ylimits = (unsigned char *)vpx_memalign(16, 16 * cm->mb_cols);
unsigned char *uvlimits = (unsigned char *)vpx_memalign(16, 8 * cm->mb_cols);
(void) low_var_thresh;
(void) flag;
vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
if (ppl > 0)
{
for (mbr = 0; mbr < cm->mb_rows; mbr++)
{
unsigned char *ylptr = ylimits;
unsigned char *uvlptr = uvlimits;
for (mbc = 0; mbc < cm->mb_cols; mbc++)
{
unsigned char mb_ppl;
if (mode_info_context->mbmi.mb_skip_coeff)
mb_ppl = (unsigned char)ppl >> 1;
else
mb_ppl = (unsigned char)ppl;
vpx_memset(ylptr, mb_ppl, 16);
vpx_memset(uvlptr, mb_ppl, 8);
ylptr += 16;
uvlptr += 8;
mode_info_context++;
}
mode_info_context++;
vp8_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
post->y_stride, source->y_width, ylimits, 16);
vp8_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
vp8_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
}
}
vpx_free(ylimits);
vpx_free(uvlimits);
}
#if !(CONFIG_TEMPORAL_DENOISING)
@ -364,33 +395,35 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
int mb_rows = source->y_width >> 4;
int mb_cols = source->y_height >> 4;
unsigned char *limits = (unsigned char *)vpx_memalign(16, 16 * mb_cols);
int mbr, mbc;
(void) post;
(void) low_var_thresh;
(void) flag;
vp8_post_proc_down_and_across(
source->y_buffer + 2 * source->y_stride + 2,
source->y_buffer + 2 * source->y_stride + 2,
source->y_stride,
source->y_stride,
source->y_height - 4,
source->y_width - 4,
ppl);
vp8_post_proc_down_and_across(
source->u_buffer + 2 * source->uv_stride + 2,
source->u_buffer + 2 * source->uv_stride + 2,
source->uv_stride,
source->uv_stride,
source->uv_height - 4,
source->uv_width - 4, ppl);
vp8_post_proc_down_and_across(
source->v_buffer + 2 * source->uv_stride + 2,
source->v_buffer + 2 * source->uv_stride + 2,
source->uv_stride,
source->uv_stride,
source->uv_height - 4,
source->uv_width - 4, ppl);
/* TODO: The original code don't filter the 2 outer rows and columns. */
vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
for (mbr = 0; mbr < mb_rows; mbr++)
{
vp8_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
source->y_buffer + 16 * mbr * source->y_stride,
source->y_stride, source->y_stride, source->y_width, limits, 16);
vp8_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
source->u_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
vp8_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
source->v_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
}
vpx_free(limits);
}
#endif
@ -752,12 +785,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
if (flags & VP8D_DEMACROBLOCK)
{
vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
vp8_de_mblock(&oci->post_proc_buffer,
q + (deblock_level - 5) * 10);
}
else if (flags & VP8D_DEBLOCK)
{
vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
q, 1, 0);
}
}
@ -766,13 +801,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
}
else if (flags & VP8D_DEMACROBLOCK)
{
vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
else if (flags & VP8D_DEBLOCK)
{
vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
q, 1, 0);
oci->postproc_state.last_base_qindex = oci->base_qindex;
}

View File

@ -36,7 +36,8 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
int low_var_thresh,
int flag);
void vp8_deblock(YV12_BUFFER_CONFIG *source,
void vp8_deblock(struct VP8Common *oci,
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,

View File

@ -19,14 +19,14 @@ void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
extern void (*vp8_post_proc_down_and_across)(
extern void (*vp8_post_proc_down_and_across_mb_row)(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
unsigned char *f,
int size
);
extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
@ -34,15 +34,15 @@ extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int
extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
extern void vp8_post_proc_down_and_across_c
extern void vp8_post_proc_down_and_across_mb_row_c
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
unsigned char *f,
int size
);
void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
@ -158,7 +158,7 @@ void vp8_machine_specific_config(void)
vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
vp8_lf_bhsimple = loop_filter_bhs_ppc;
vp8_post_proc_down_and_across = vp8_post_proc_down_and_across_c;
vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
vp8_plane_add_noise = vp8_plane_add_noise_c;

View File

@ -162,9 +162,8 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then
specialize vp8_mbpost_proc_across_ip sse2
vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
prototype void vp8_post_proc_down_and_across "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int rows, int cols, int flimit"
specialize vp8_post_proc_down_and_across mmx sse2
vp8_post_proc_down_and_across_sse2=vp8_post_proc_down_and_across_xmm
prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
specialize vp8_post_proc_down_and_across_mb_row sse2
prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
specialize vp8_plane_add_noise mmx sse2

View File

@ -14,271 +14,6 @@
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7
;void vp8_post_proc_down_and_across_mmx
;(
; unsigned char *src_ptr,
; unsigned char *dst_ptr,
; int src_pixels_per_line,
; int dst_pixels_per_line,
; int rows,
; int cols,
; int flimit
;)
global sym(vp8_post_proc_down_and_across_mmx) PRIVATE
sym(vp8_post_proc_down_and_across_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
; move the global rd onto the stack, since we don't have enough registers
; to do PIC addressing
movq mm0, [GLOBAL(rd)]
sub rsp, 8
movq [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif
push rbx
lea rbx, [GLOBAL(Blur)]
movd mm2, dword ptr arg(6) ;flimit
punpcklwd mm2, mm2
punpckldq mm2, mm2
mov rsi, arg(0) ;src_ptr
mov rdi, arg(1) ;dst_ptr
movsxd rcx, DWORD PTR arg(4) ;rows
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
.nextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
movq mm3, [rsi] ; mm4 = r0 p0..p7
punpcklbw mm3, mm0 ; mm3 = p0..p3
movq mm1, mm3 ; mm1 = p0..p3
pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
movq mm5, [rsi + rax] ; mm4 = r1 p0..p7
punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
paddusw mm3, mm6 ; mm3 += mm6
; thresholding
movq mm7, mm1 ; mm7 = r0 p0..p3
psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
pcmpgtw mm7, mm2
movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7
punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = r0 p0..p3
psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3
paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
neg rax
movq mm6, [rbx ] ; kernel 0 taps
movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7
punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = r0 p0..p3
psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
movq mm6, [rbx + 16] ; kernel 1 taps
movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = r0 p0..p3
psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
paddusw mm3, RD ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
pand mm1, mm7 ; mm1 select vals > thresh from source
pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
paddusw mm1, mm7 ; combination
packuswb mm1, mm0 ; pack to bytes
movd [rdi], mm1 ;
neg rax ; pitch is positive
add rsi, 4
add rdi, 4
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
; dup the first byte into the left border 8 times
movq mm1, [rdi]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
mov rdx, -8
movq [rdi+rdx], mm1
; dup the last byte into the right border
movsxd rdx, dword arg(5)
movq mm1, [rdi + rdx + -1]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
movq [rdi+rdx], mm1
push rax
xor rdx, rdx
mov rax, [rdi-4];
.acrossnextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ;
movq mm4, [rdi+rdx] ; mm4 = p0..p7
movq mm3, mm4 ; mm3 = p0..p7
punpcklbw mm3, mm0 ; mm3 = p0..p3
movq mm1, mm3 ; mm1 = p0..p3
pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
movq mm6, [rbx + 48]
psrlq mm4, 8 ; mm4 = p1..p7
movq mm5, mm4 ; mm5 = p1..p7
punpcklbw mm5, mm0 ; mm5 = p1..p4
pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
paddusw mm3, mm6 ; mm3 += mm6
; thresholding
movq mm7, mm1 ; mm7 = p0..p3
psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
pcmpgtw mm7, mm2
movq mm6, [rbx + 64 ]
psrlq mm4, 8 ; mm4 = p2..p7
movq mm5, mm4 ; mm5 = p2..p7
punpcklbw mm5, mm0 ; mm5 = p2..p5
pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = p0..p3
psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
movq mm6, [rbx ]
movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
movq mm5, mm4 ; mm5 = p-2..p5
punpcklbw mm5, mm0 ; mm5 = p-2..p1
pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = p0..p3
psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
movq mm6, [rbx + 16]
psrlq mm4, 8 ; mm4 = p-1..p5
punpcklbw mm4, mm0 ; mm4 = p-1..p2
pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = p0..p3
psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4
psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3
paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
paddusw mm3, RD ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
pand mm1, mm7 ; mm1 select vals > thresh from source
pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
paddusw mm1, mm7 ; combination
packuswb mm1, mm0 ; pack to bytes
mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
movd eax, mm1
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
jl .acrossnextcol;
mov DWORD PTR [rdi+rdx-4], eax
pop rax
; done with this rwo
add rsi,rax ; next line
movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
add rdi,rax ; next destination
movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
jnz .nextrow ; next row
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
%undef RD
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
extern sym(vp8_rv)

View File

@ -11,146 +11,158 @@
%include "vpx_ports/x86_abi_support.asm"
;void vp8_post_proc_down_and_across_xmm
;macro in deblock functions
; FIRST_2_ROWS: first half of the deblock filter for 16 pixels at once.
; Inputs:  xmm0   = the 16 pixels being filtered,
;          xmm1   = first neighbour, xmm3 = second neighbour (loaded by caller),
;          flimit = 16 per-pixel byte thresholds.
; Outputs: xmm5   = pavgb(xmm1, xmm3), the rounded average of the two neighbours,
;          xmm7   = byte mask, 0xff where either |xmm0 - neighbour| >= flimit.
; xmm0 is left unchanged; xmm1, xmm2, xmm3, xmm4 and xmm6 are overwritten.
%macro FIRST_2_ROWS 0
movdqa xmm4, xmm0
movdqa xmm6, xmm0
movdqa xmm5, xmm1
pavgb xmm5, xmm3
;calculate absolute value
; Unsigned saturating subtraction in both directions: one result is zero,
; so their saturating sum is the absolute difference.
psubusb xmm4, xmm1
psubusb xmm1, xmm0
psubusb xmm6, xmm3
psubusb xmm3, xmm0
paddusb xmm4, xmm1
paddusb xmm6, xmm3
;get threshold
movdqa xmm2, flimit
pxor xmm1, xmm1
movdqa xmm7, xmm2
;get mask
; flimit - |diff| saturates to zero exactly when |diff| >= flimit, so the
; compare against zero yields 0xff for pixels that must not be filtered.
psubusb xmm2, xmm4
psubusb xmm7, xmm6
pcmpeqb xmm2, xmm1
pcmpeqb xmm7, xmm1
por xmm7, xmm2
%endmacro
; SECOND_2_ROWS: second half of the deblock filter for 16 pixels at once.
; Inputs:  xmm0   = the 16 pixels being filtered,
;          xmm1   = third neighbour, xmm3 = fourth neighbour (loaded by caller),
;          xmm5   = neighbour average and xmm7 = mask left by FIRST_2_ROWS,
;          flimit = 16 per-pixel byte thresholds.
; Output:  xmm0   = filtered pixel where the mask is clear, original pixel
;                   where any of the four |differences| >= flimit.
; xmm1 through xmm7 are overwritten.
%macro SECOND_2_ROWS 0
movdqa xmm6, xmm0
movdqa xmm4, xmm0
movdqa xmm2, xmm1
pavgb xmm1, xmm3
;calculate absolute value
; Same two-way saturating subtract/add trick as in FIRST_2_ROWS.
psubusb xmm6, xmm2
psubusb xmm2, xmm0
psubusb xmm4, xmm3
psubusb xmm3, xmm0
paddusb xmm6, xmm2
paddusb xmm4, xmm3
; Combine the two pair averages into one neighbourhood average.
pavgb xmm5, xmm1
;get threshold
movdqa xmm2, flimit
pxor xmm1, xmm1
movdqa xmm3, xmm2
;get mask
; Accumulate this pair's "difference >= flimit" bytes into the xmm7 mask.
psubusb xmm2, xmm6
psubusb xmm3, xmm4
pcmpeqb xmm2, xmm1
pcmpeqb xmm3, xmm1
por xmm7, xmm2
por xmm7, xmm3
; Average the neighbourhood value with the pixel itself: the filtered value.
pavgb xmm5, xmm0
;decide if or not to use filtered value
; xmm0 = (xmm0 & mask) + (filtered & ~mask); the two terms never overlap.
pand xmm0, xmm7
pandn xmm7, xmm5
paddusb xmm0, xmm7
%endmacro
; UPDATE_FLIMIT: copy the next 16 threshold bytes from [rbx] to [rsp] and
; advance rbx by 16. The function defines `flimit` as [rsp], so this refreshes
; the thresholds read by FIRST_2_ROWS / SECOND_2_ROWS.
; movdqa is an aligned access, so both [rbx] and [rsp] must be 16-byte aligned.
; Overwrites xmm2.
%macro UPDATE_FLIMIT 0
movdqa xmm2, XMMWORD PTR [rbx]
movdqa [rsp], xmm2
add rbx, 16
%endmacro
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
; unsigned char *src_ptr,
; unsigned char *dst_ptr,
; int src_pixels_per_line,
; int dst_pixels_per_line,
; int rows,
; int cols,
; int flimit
; unsigned char *flimits,
; int size
;)
global sym(vp8_post_proc_down_and_across_xmm) PRIVATE
sym(vp8_post_proc_down_and_across_xmm):
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
ALIGN_STACK 16, rax
; move the global rd onto the stack, since we don't have enough registers
; to do PIC addressing
movdqa xmm0, [GLOBAL(rd42)]
sub rsp, 16
movdqa [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif
; put flimit on stack
mov rbx, arg(5) ;flimits ptr
UPDATE_FLIMIT
movd xmm2, dword ptr arg(6) ;flimit
punpcklwd xmm2, xmm2
punpckldq xmm2, xmm2
punpcklqdq xmm2, xmm2
%define flimit [rsp]
mov rsi, arg(0) ;src_ptr
mov rdi, arg(1) ;dst_ptr
movsxd rcx, DWORD PTR arg(4) ;rows
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor xmm0, xmm0 ; mm0 = 00000000
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
xor rdx, rdx ;col
.nextcol:
movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
movdqa xmm1, xmm3 ; mm1 = p0..p3
psllw xmm3, 2 ;
movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
paddusw xmm3, xmm5 ; mm3 += mm6
; thresholding
movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
pcmpgtw xmm7, xmm2
movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
paddusw xmm3, xmm5 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
;load current and next 2 rows
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + rax]
movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
FIRST_2_ROWS
;load above 2 rows
neg rax
movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
paddusw xmm3, xmm5 ; mm3 += mm5
movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
movdqu xmm3, XMMWORD PTR [rsi + rax]
; thresholding
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
SECOND_2_ROWS
movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
paddusw xmm3, xmm4 ; mm3 += mm5
movdqu XMMWORD PTR [rdi], xmm0
; thresholding
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3
psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3
paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
neg rax ; positive stride
add rsi, 16
add rdi, 16
UPDATE_FLIMIT
paddusw xmm3, RD42 ; mm3 += round value
psraw xmm3, 3 ; mm3 /= 8
pand xmm1, xmm7 ; mm1 select vals > thresh from source
pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
paddusw xmm1, xmm7 ; combination
packuswb xmm1, xmm0 ; pack to bytes
movq QWORD PTR [rdi], xmm1 ;
neg rax ; pitch is positive
add rsi, 8
add rdi, 8
add rdx, 8
cmp edx, dword arg(5) ;cols
add rdx, 16
cmp edx, dword arg(4) ;cols
jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
mov rbx, arg(5) ; flimits
UPDATE_FLIMIT
; dup the first byte into the left border 8 times
movq mm1, [rdi]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
mov rdx, -8
movq [rdi+rdx], mm1
; dup the last byte into the right border
movsxd rdx, dword arg(5)
movsxd rdx, dword arg(4)
movq mm1, [rdi + rdx + -1]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
@ -158,113 +170,63 @@ sym(vp8_post_proc_down_and_across_xmm):
movq [rdi+rdx], mm1
xor rdx, rdx
movq mm0, QWORD PTR [rdi-8];
movq mm0, QWORD PTR [rdi-16];
movq mm1, QWORD PTR [rdi-8];
.acrossnextcol:
movq xmm7, QWORD PTR [rdi +rdx -2]
movd xmm4, DWORD PTR [rdi +rdx +6]
movdqu xmm0, XMMWORD PTR [rdi + rdx]
movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
pslldq xmm4, 8
por xmm4, xmm7
FIRST_2_ROWS
movdqa xmm3, xmm4
psrldq xmm3, 2
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
movdqa xmm1, xmm3 ; mm1 = p0..p3
psllw xmm3, 2
movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
SECOND_2_ROWS
movdqa xmm5, xmm4
psrldq xmm5, 3
punpcklbw xmm5, xmm0 ; mm5 = p1..p4
paddusw xmm3, xmm5 ; mm3 += mm6
movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
movdq2q mm0, xmm0
psrldq xmm0, 8
movdq2q mm1, xmm0
; thresholding
movdqa xmm7, xmm1 ; mm7 = p0..p3
psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
pcmpgtw xmm7, xmm2
UPDATE_FLIMIT
movdqa xmm5, xmm4
psrldq xmm5, 4
punpcklbw xmm5, xmm0 ; mm5 = p2..p5
paddusw xmm3, xmm5 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = p0..p3
psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
movdqa xmm5, xmm4 ; mm5 = p-2..p5
punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
paddusw xmm3, xmm5 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = p0..p3
psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
psrldq xmm4, 1 ; mm4 = p-1..p5
punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
paddusw xmm3, xmm4 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = p0..p3
psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
paddusw xmm3, RD42 ; mm3 += round value
psraw xmm3, 3 ; mm3 /= 8
pand xmm1, xmm7 ; mm1 select vals > thresh from source
pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
paddusw xmm1, xmm7 ; combination
packuswb xmm1, xmm0 ; pack to bytes
movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes
movdq2q mm0, xmm1
add rdx, 8
cmp edx, dword arg(5) ;cols
add rdx, 16
cmp edx, dword arg(4) ;cols
jl .acrossnextcol;
; last 8 pixels
movq QWORD PTR [rdi+rdx-8], mm0
; last 16 pixels
movq QWORD PTR [rdi+rdx-16], mm0
cmp edx, dword arg(4)
jne .throw_last_8
movq QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
; done with this rwo
add rsi,rax ; next line
mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
add rsi,rax ;next src line
mov eax, dword arg(3) ;dst_pixels_per_line
add rdi,rax ;next destination
mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
mov eax, dword arg(2) ;src_pixels_per_line
mov rbx, arg(5) ;flimits
UPDATE_FLIMIT
dec rcx ;decrement count
jnz .nextrow ;next row
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
add rsp, 16
pop rsp
%endif
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%undef RD42
%undef flimit
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
@ -753,7 +715,5 @@ sym(vp8_plane_add_noise_wmt):
SECTION_RODATA
align 16
rd42:
times 8 dw 0x04
four8s:
times 4 dd 8

View File

@ -5301,7 +5301,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
double frame_psnr2, frame_ssim2 = 0;
double weight = 0;
vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
vp8_clear_system_state();
ye = calc_plane_error(orig->y_buffer, orig->y_stride,