From 94ae0430d28c7204af0ee911e90e598542c65fdc Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 25 Jun 2014 11:28:02 -0700 Subject: [PATCH] vp8: Add temporal denoising for UV-channel. C version and sse2 version, and off by default. For the test clip used, the sse2 performance improved by ~5.6% Change-Id: Ic2d815968849db51b9d62085d7a490d0e01574f6 --- vp8/common/rtcd_defs.pl | 3 + vp8/encoder/denoising.c | 194 ++++++++++++++++++++++++- vp8/encoder/denoising.h | 5 + vp8/encoder/x86/denoising_sse2.c | 241 ++++++++++++++++++++++++++++--- 4 files changed, 416 insertions(+), 27 deletions(-) diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index cbfd76a8d..ba2693968 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -554,6 +554,9 @@ $vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon; if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") { add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising"; specialize qw/vp8_denoiser_filter sse2 neon/; + add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising"; + specialize qw/vp8_denoiser_filter_uv sse2/; + } # End of encoder only functions diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index 0f2e5f17b..9ad411366 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -191,6 +191,148 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, return FILTER_BLOCK; } +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, + int mc_avg_uv_stride, + unsigned char *running_avg_uv, + int avg_uv_stride, + unsigned char *sig, + int sig_stride, + unsigned int motion_magnitude, + int 
increase_denoising) { + unsigned char *running_avg_uv_start = running_avg_uv; + unsigned char *sig_start = sig; + int sum_diff_thresh; + int r, c; + int sum_diff = 0; + int sum_block = 0; + int adj_val[3] = {3, 4, 6}; + int shift_inc1 = 0; + int shift_inc2 = 1; + /* If motion_magnitude is small, making the denoiser more aggressive by + * increasing the adjustment for each level. Add another increment for + * blocks that are labeled for increase denoising. */ + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) { + if (increase_denoising) { + shift_inc1 = 1; + shift_inc2 = 2; + } + adj_val[0] += shift_inc2; + adj_val[1] += shift_inc2; + adj_val[2] += shift_inc2; + } + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + for (c = 0; c < 8; ++c) { + sum_block += sig[c]; + } + sig += sig_stride; + } + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + + sig -= sig_stride * 8; + for (r = 0; r < 8; ++r) { + for (c = 0; c < 8; ++c) { + int diff = 0; + int adjustment = 0; + int absdiff = 0; + + diff = mc_running_avg_uv[c] - sig[c]; + absdiff = abs(diff); + + // When |diff| <= |3 + shift_inc1|, use pixel value from + // last denoised raw. + if (absdiff <= 3 + shift_inc1) { + running_avg_uv[c] = mc_running_avg_uv[c]; + sum_diff += diff; + } else { + if (absdiff >= 4 && absdiff <= 7) + adjustment = adj_val[0]; + else if (absdiff >= 8 && absdiff <= 15) + adjustment = adj_val[1]; + else + adjustment = adj_val[2]; + if (diff > 0) { + if ((sig[c] + adjustment) > 255) + running_avg_uv[c] = 255; + else + running_avg_uv[c] = sig[c] + adjustment; + sum_diff += adjustment; + } else { + if ((sig[c] - adjustment) < 0) + running_avg_uv[c] = 0; + else + running_avg_uv[c] = sig[c] - adjustment; + sum_diff -= adjustment; + } + } + } + /* Update pointers for next iteration. 
*/ + sig += sig_stride; + mc_running_avg_uv += mc_avg_uv_stride; + running_avg_uv += avg_uv_stride; + } + + sum_diff_thresh= SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), check + // if we can still apply some (weaker) temporal filtering to this block, + // that would otherwise not be denoised at all. Simplest is to apply + // an additional adjustment to running_avg_uv to bring it closer to sig. + // The adjustment is capped by a maximum delta, and chosen such that + // in most cases the resulting sum_diff will be within the + // acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over threshold. + int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + sig -= sig_stride * 8; + mc_running_avg_uv -= mc_avg_uv_stride * 8; + running_avg_uv -= avg_uv_stride * 8; + for (r = 0; r < 8; ++r) { + for (c = 0; c < 8; ++c) { + int diff = mc_running_avg_uv[c] - sig[c]; + int adjustment = abs(diff); + if (adjustment > delta) + adjustment = delta; + if (diff > 0) { + // Bring denoised signal down. + if (running_avg_uv[c] - adjustment < 0) + running_avg_uv[c] = 0; + else + running_avg_uv[c] = running_avg_uv[c] - adjustment; + sum_diff -= adjustment; + } else if (diff < 0) { + // Bring denoised signal up. + if (running_avg_uv[c] + adjustment > 255) + running_avg_uv[c] = 255; + else + running_avg_uv[c] = running_avg_uv[c] + adjustment; + sum_diff += adjustment; + } + } + // TODO(marpan): Check here if abs(sum_diff) has gone below the + // threshold sum_diff_thresh, and if so, we can exit the row loop.
+ sig += sig_stride; + mc_running_avg_uv += mc_avg_uv_stride; + running_avg_uv += avg_uv_stride; + } + if (abs(sum_diff) > sum_diff_thresh) + return COPY_BLOCK; + } else { + return COPY_BLOCK; + } + } + + vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start, + sig_stride); + return FILTER_BLOCK; +} + int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, int num_mb_rows, int num_mb_cols) { @@ -260,6 +402,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, unsigned int motion_magnitude2; unsigned int sse_thresh; int sse_diff_thresh = 0; + // Denoise the UV channel. + int apply_color_denoise = 0; // Spatial loop filter: only applied selectively based on // temporal filter state of block relative to top/left neighbors. int apply_spatial_loop_filter = 1; @@ -267,6 +411,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame; enum vp8_denoiser_decision decision = FILTER_BLOCK; + enum vp8_denoiser_decision decision_u = FILTER_BLOCK; + enum vp8_denoiser_decision decision_v = FILTER_BLOCK; if (zero_frame) { @@ -376,11 +522,37 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, /* Filter. */ decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride, - running_avg_y, avg_y_stride, - x->thismb, 16, motion_magnitude2, - x->increase_denoising); + running_avg_y, avg_y_stride, + x->thismb, 16, motion_magnitude2, + x->increase_denoising); denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ? kFilterNonZeroMV : kFilterZeroMV; + // Only denoise UV for zero motion, and if y channel was denoised. 
+ if (apply_color_denoise && + motion_magnitude2 == 0 && + decision == FILTER_BLOCK) { + unsigned char *mc_running_avg_u = + denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset; + unsigned char *running_avg_u = + denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset; + unsigned char *mc_running_avg_v = + denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset; + unsigned char *running_avg_v = + denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset; + int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride; + int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride; + int signal_stride = x->block[16].src_stride; + decision_u = + vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride, + running_avg_u, avg_uv_stride, + x->block[16].src + *x->block[16].base_src, + signal_stride, motion_magnitude2, 0); + decision_v = + vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride, + running_avg_v, avg_uv_stride, + x->block[20].src + *x->block[20].base_src, + signal_stride, motion_magnitude2, 0); + } } if (decision == COPY_BLOCK) { @@ -393,7 +565,21 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, denoiser->yv12_running_avg[INTRA_FRAME].y_stride); denoiser->denoise_state[block_index] = kNoFilter; } - // Option to selectively deblock the denoised signal. + if (apply_color_denoise) { + if (decision_u == COPY_BLOCK) { + vp8_copy_mem8x8( + x->block[16].src + *x->block[16].base_src, x->block[16].src_stride, + denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset, + denoiser->yv12_running_avg[INTRA_FRAME].uv_stride); + } + if (decision_v == COPY_BLOCK) { + vp8_copy_mem8x8( + x->block[20].src + *x->block[20].base_src, x->block[16].src_stride, + denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset, + denoiser->yv12_running_avg[INTRA_FRAME].uv_stride); + } + } + // Option to selectively deblock the denoised signal, for y channel only. 
if (apply_spatial_loop_filter) { loop_filter_info lfi; int apply_filter_col = 0; diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h index 6db0785a0..8f1bfa51d 100644 --- a/vp8/encoder/denoising.h +++ b/vp8/encoder/denoising.h @@ -22,6 +22,11 @@ extern "C" { #define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3) #define MOTION_MAGNITUDE_THRESHOLD (8*3) +#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5) +#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2) +#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4) +#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3) + enum vp8_denoiser_decision { COPY_BLOCK, diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c index ff439dd64..4d1452cc1 100644 --- a/vp8/encoder/x86/denoising_sse2.c +++ b/vp8/encoder/x86/denoising_sse2.c @@ -17,10 +17,23 @@ #include <emmintrin.h> #include "vpx_ports/emmintrin_compat.h" -union sum_union { - __m128i v; - signed char e[16]; -}; +/* Compute the sum of all pixel differences of this MB. */ +static inline unsigned int abs_sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = _mm_srai_epi16( + _mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = _mm_srai_epi16( + _mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, + _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, + _mm_srli_si128(hgfe_dcba, 4)); + unsigned int sum_diff = _mm_cvtsi128_si32(hgfedcba); + + return abs(sum_diff); +} int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, @@ -103,16 +116,10 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, { /* Compute the sum of all pixel differences of this MB.
*/ - union sum_union s; - int sum_diff = 0; - s.v = acc_diff; - sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5] - + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11] - + s.e[12] + s.e[13] + s.e[14] + s.e[15]; - + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); sum_diff_thresh = SUM_DIFF_THRESHOLD; if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; - if (abs(sum_diff) > sum_diff_thresh) { + if (abs_sum_diff > sum_diff_thresh) { // Before returning to copy the block (i.e., apply no denoising), // checK if we can still apply some (weaker) temporal filtering to // this block, that would otherwise not be denoised at all. Simplest @@ -123,7 +130,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, // The delta is set by the excess of absolute pixel diff over the // threshold. - int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); @@ -162,16 +169,9 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } - { - // Update the sum of all pixel differences of this MB. 
- union sum_union s; - s.v = acc_diff; - sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5] - + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11] - + s.e[12] + s.e[13] + s.e[14] + s.e[15]; - if (abs(sum_diff) > sum_diff_thresh) { - return COPY_BLOCK; - } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; } } else { return COPY_BLOCK; @@ -182,3 +182,198 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); return FILTER_BLOCK; } + +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, + int mc_avg_stride, + unsigned char *running_avg, int avg_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + unsigned char *running_avg_start = running_avg; + unsigned char *sig_start = sig; + int sum_diff_thresh; + int r; + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? + 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + { + const __m128i k_1 = _mm_set1_epi16(1); + __m128i vec_sum_block = _mm_setzero_si128(); + + // Avoid denoising color signal if its close to average level. 
+ for (r = 0; r < 8; ++r) { + const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0])); + const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0); + vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1); + const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, + _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, + _mm_srli_si128(hgfe_dcba, 4)); + const int sum_block = _mm_cvtsi128_si32(hgfedcba); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + for (r = 0; r < 4; ++r) { + /* Calculate differences */ + const __m128i v_sig_low = _mm_castpd_si128( + _mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_sig_low), + (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = _mm_castpd_si128( + _mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. 
*/ + const __m128i clamped_absdiff = _mm_min_epu8( + _mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + __m128i v_running_avg; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. */ + v_running_avg = _mm_adds_epu8(v_sig, padj); + v_running_avg = _mm_subs_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], + _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + /* Adjustments <= 8, and each element in acc_diff can fit in signed + * char. + */ + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + + { + unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); + sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (abs_sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all.
Simplest + // is to apply an additional adjustment to running_avg to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; + for (r = 0; r < 4; ++r) { + // Calculate differences. + const __m128i v_sig_low = _mm_castpd_si128( + _mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_sig_low), + (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = _mm_castpd_si128( + _mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + const __m128i v_running_avg_low = _mm_castpd_si128( + _mm_load_sd((double *)(&running_avg[0]))); + __m128i v_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low), + (double *)(&running_avg[avg_stride]))); + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value.
+ v_running_avg = _mm_subs_epu8(v_running_avg, padj); + v_running_avg = _mm_adds_epu8(v_running_avg, nadj); + + _mm_storel_pd((double *)&running_avg[0], + _mm_castsi128_pd(v_running_avg)); + _mm_storeh_pd((double *)&running_avg[avg_stride], + _mm_castsi128_pd(v_running_avg)); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + + // Update pointers for next iteration. + sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); + return FILTER_BLOCK; +}