diff --git a/configure b/configure index 7b9c21184..3ed976c83 100755 --- a/configure +++ b/configure @@ -281,7 +281,7 @@ EXPERIMENT_LIST=" spatial_svc vp9_temporal_denoising fp_mb_stats - emulate_hardware_highbitdepth + emulate_hardware " CONFIG_LIST=" external_build diff --git a/test/convolve_test.cc b/test/convolve_test.cc index de947aadc..3342e107e 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -217,7 +217,7 @@ void high_filter_block2d_8_c(const uint16_t *src_ptr, (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... - *output_ptr = clip_pixel_high(temp >> VP9_FILTER_SHIFT, bd); + *output_ptr = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd); ++src_ptr; output_ptr += intermediate_height; } @@ -245,7 +245,7 @@ void high_filter_block2d_8_c(const uint16_t *src_ptr, (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... - *dst_ptr++ = clip_pixel_high(temp >> VP9_FILTER_SHIFT, bd); + *dst_ptr++ = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd); src_ptr += intermediate_height; } src_ptr += intermediate_next_stride; diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index d1ce1097d..ff2d83111 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -745,7 +745,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); #endif -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans16x16DCT, ::testing::Values( @@ -753,7 +753,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct16x16_256_add_neon, 0, VPX_BITS_8))); #endif -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSE2, Trans16x16DCT, ::testing::Values( @@ -772,7 +772,7 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8))); #endif -#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSSE3, Trans16x16DCT, ::testing::Values( diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index c7a1931af..044373e74 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -333,7 +333,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8))); #endif -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans32x32Test, ::testing::Values( @@ -343,7 +343,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8))); #endif -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSE2, Trans32x32Test, ::testing::Values( @@ -353,7 +353,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); #endif -#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( AVX2, Trans32x32Test, ::testing::Values( diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index f803c8e25..d75bd69f3 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -458,7 +458,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8))); #endif -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans4x4DCT, ::testing::Values( @@ -473,14 +473,15 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); #endif -#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \ + !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( MMX, Trans4x4WHT, ::testing::Values( make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8))); #endif -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSE2, Trans4x4DCT, ::testing::Values( diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 60d0be51d..44a26545e 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -568,7 +568,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); #endif -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8DCT, ::testing::Values( @@ -583,7 +583,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8))); #endif -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSE2, FwdTrans8x8DCT, ::testing::Values( @@ -598,7 +598,8 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8))); #endif -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ + !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSSE3, FwdTrans8x8DCT, ::testing::Values( diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 9c24fee5c..536273e3e 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -260,7 +260,7 @@ INSTANTIATE_TEST_CASE_P( TX_4X4, 1))); #endif -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSE2, PartialIDctTest, ::testing::Values( @@ -294,7 +294,8 @@ INSTANTIATE_TEST_CASE_P( TX_4X4, 1))); #endif -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ + !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSSE3_64, PartialIDctTest, ::testing::Values( @@ -304,7 +305,7 @@ INSTANTIATE_TEST_CASE_P( TX_8X8, 12))); #endif -#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSSE3, PartialIDctTest, ::testing::Values( diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 530d86f34..6801dd3a2 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -65,7 +65,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE uint16_t clip_pixel_high(int val, int bd) { +static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: default: diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index 973d0244c..f5ed6b261 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -299,7 +299,7 @@ static void high_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_q4 += x_step_q4; } src += src_stride; @@ -325,7 +325,7 @@ static void high_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); x_q4 += x_step_q4; } src += src_stride; @@ -350,7 +350,7 @@ static void high_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = clip_pixel_high( + dst[y * dst_stride] = clip_pixel_highbd( ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); y_q4 += y_step_q4; } @@ -377,7 +377,7 @@ static void high_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); y_q4 += y_step_q4; } ++src; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 5b031752b..093160fc3 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -15,39 +15,49 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" -#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH -// When CONFIG_EMULATE_HW_HIGHBITDEPTH is 1 the transform performs strict -// overflow wrapping to match expected hardware implementations. +#if CONFIG_EMULATE_HARDWARE +// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a +// non-normative method to handle overflows. A stream that causes +// overflows in the inverse transform is considered invalid in VP9, +// and a hardware implementer is free to choose any reasonable +// method to handle overflows. However to aid in hardware +// verification they can use a specific implementation of the +// WRAPLOW() macro below that is identical to their intended +// hardware implementation (and also use configure options to trigger +// the C-implementation of the transform). +// +// The particular WRAPLOW implementation below performs strict +// overflow wrapping to match common hardware implementations. // bd of 8 uses trans_low with 16bits, need to remove 16bits // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits -#define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd)) +#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd)) #else -#define WRAPLOW(x) (x) -#endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH +#define WRAPLOW(x, bd) (x) +#endif // CONFIG_EMULATE_HARDWARE #if CONFIG_VP9_HIGHBITDEPTH -static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low, - tran_low_t high) { - return value < low ? low : (value > high ? high : value); -} - -static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest, - tran_high_t trans, int bd) { - trans = WRAPLOW(trans); +static INLINE tran_low_t highbd_clip_pixel_add(tran_high_t dest, + tran_high_t trans, int bd) { + trans = WRAPLOW(trans, bd); switch (bd) { case 8: default: - return clamp_high(WRAPLOW(dest + trans), 0, 255); + return clamp(WRAPLOW(dest + trans, bd), 0, 255); case 10: - return clamp_high(WRAPLOW(dest + trans), 0, 1023); + return clamp(WRAPLOW(dest + trans, bd), 0, 1023); case 12: - return clamp_high(WRAPLOW(dest + trans), 0, 4095); + return clamp(WRAPLOW(dest + trans, bd), 0, 4095); } } #endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE tran_low_t clip_pixel_add(tran_high_t dest, tran_high_t trans) { + trans = WRAPLOW(trans, 8); + return clamp(WRAPLOW(dest + trans, 8), 0, 255); +} + void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ @@ -69,10 +79,10 @@ void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { c1 = e1 - c1; a1 -= b1; d1 += c1; - op[0] = a1; - op[1] = b1; - op[2] = c1; - op[3] = d1; + op[0] = WRAPLOW(a1, 8); + op[1] = WRAPLOW(b1, 8); + op[2] = WRAPLOW(c1, 8); + op[3] = WRAPLOW(d1, 8); ip += 4; op += 4; } @@ -90,10 +100,10 @@ void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { c1 = e1 - c1; a1 -= b1; d1 += c1; - dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); - dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); - dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); - dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); + dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1); ip++; dest++; @@ -110,17 +120,17 @@ void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; - op[0] = a1; - op[1] = op[2] = op[3] = e1; + op[0] = WRAPLOW(a1, 8); + op[1] = op[2] = op[3] = WRAPLOW(e1, 8); ip = tmp; for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); - dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); - dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); - dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); + dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); + dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); + dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); + dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); ip++; dest++; } @@ -132,18 +142,18 @@ static void idct4(const tran_low_t *input, tran_low_t *output) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = dct_const_round_shift(temp1); - step[1] = dct_const_round_shift(temp2); + step[0] = WRAPLOW(dct_const_round_shift(temp1), 8); + step[1] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = dct_const_round_shift(temp1); - step[3] = dct_const_round_shift(temp2); + step[2] = WRAPLOW(dct_const_round_shift(temp1), 8); + step[3] = WRAPLOW(dct_const_round_shift(temp2), 8); // stage 2 - output[0] = step[0] + step[3]; - output[1] = step[1] + step[2]; - output[2] = step[1] - step[2]; - output[3] = step[0] - step[3]; + output[0] = WRAPLOW(step[0] + step[3], 8); + output[1] = WRAPLOW(step[1] + step[2], 8); + output[2] = WRAPLOW(step[1] - step[2], 8); + output[3] = WRAPLOW(step[0] - step[3], 8); } void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -164,9 +174,10 @@ void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; idct4(temp_in, temp_out); - for (j = 0; j < 4; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) - + dest[j * stride + i]); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 4), + dest[j * stride + i]); + } } } @@ -174,15 +185,15 @@ void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride) { int i; tran_high_t a1; - tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { - dest[0] = clip_pixel(dest[0] + a1); - dest[1] = clip_pixel(dest[1] + a1); - dest[2] = clip_pixel(dest[2] + a1); - dest[3] = clip_pixel(dest[3] + a1); + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], a1); dest += dest_stride; } } @@ -197,39 +208,39 @@ static void idct8(const tran_low_t *input, tran_low_t *output) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = dct_const_round_shift(temp1); - step1[7] = dct_const_round_shift(temp2); + step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = dct_const_round_shift(temp1); - step1[6] = dct_const_round_shift(temp2); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); // stage 2 & stage 3 - even half idct4(step1, step1); // stage 2 - odd half - step2[4] = step1[4] + step1[5]; - step2[5] = step1[4] - step1[5]; - step2[6] = -step1[6] + step1[7]; - step2[7] = step1[6] + step1[7]; + step2[4] = WRAPLOW(step1[4] + step1[5], 8); + step2[5] = WRAPLOW(step1[4] - step1[5], 8); + step2[6] = WRAPLOW(-step1[6] + step1[7], 8); + step2[7] = WRAPLOW(step1[6] + step1[7], 8); // stage 3 -odd half step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = dct_const_round_shift(temp1); - step1[6] = dct_const_round_shift(temp2); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); step1[7] = step2[7]; // stage 4 - output[0] = step1[0] + step1[7]; - output[1] = step1[1] + step1[6]; - output[2] = step1[2] + step1[5]; - output[3] = step1[3] + step1[4]; - output[4] = step1[3] - step1[4]; - output[5] = step1[2] - step1[5]; - output[6] = step1[1] - step1[6]; - output[7] = step1[0] - step1[7]; + output[0] = WRAPLOW(step1[0] + step1[7], 8); + output[1] = WRAPLOW(step1[1] + step1[6], 8); + output[2] = WRAPLOW(step1[2] + step1[5], 8); + output[3] = WRAPLOW(step1[3] + step1[4], 8); + output[4] = WRAPLOW(step1[3] - step1[4], 8); + output[5] = WRAPLOW(step1[2] - step1[5], 8); + output[6] = WRAPLOW(step1[1] - step1[6], 8); + output[7] = WRAPLOW(step1[0] - step1[7], 8); } void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -250,21 +261,22 @@ void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; idct8(temp_in, temp_out); - for (j = 0; j < 8; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * stride + i]); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 5), + dest[j * stride + i]); + } } } void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) - dest[i] = clip_pixel(dest[i] + a1); + dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -305,10 +317,10 @@ static void iadst4(const tran_low_t *input, tran_low_t *output) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = dct_const_round_shift(s0); - output[1] = dct_const_round_shift(s1); - output[2] = dct_const_round_shift(s2); - output[3] = dct_const_round_shift(s3); + output[0] = WRAPLOW(dct_const_round_shift(s0), 8); + output[1] = WRAPLOW(dct_const_round_shift(s1), 8); + output[2] = WRAPLOW(dct_const_round_shift(s2), 8); + output[3] = WRAPLOW(dct_const_round_shift(s3), 8); } void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, @@ -337,11 +349,13 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; IHT_4[tx_type].cols(temp_in, temp_out); - for (j = 0; j < 4; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) - + dest[j * stride + i]); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 4), + dest[j * stride + i]); + } } } + static void iadst8(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; @@ -370,14 +384,14 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = dct_const_round_shift(s0 + s4); - x1 = dct_const_round_shift(s1 + s5); - x2 = dct_const_round_shift(s2 + s6); - x3 = dct_const_round_shift(s3 + s7); - x4 = dct_const_round_shift(s0 - s4); - x5 = dct_const_round_shift(s1 - s5); - x6 = dct_const_round_shift(s2 - s6); - x7 = dct_const_round_shift(s3 - s7); + x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8); // stage 2 s0 = x0; @@ -389,14 +403,14 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) { s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = dct_const_round_shift(s4 + s6); - x5 = dct_const_round_shift(s5 + s7); - x6 = dct_const_round_shift(s4 - s6); - x7 = dct_const_round_shift(s5 - s7); + x0 = WRAPLOW(s0 + s2, 8); + x1 = WRAPLOW(s1 + s3, 8); + x2 = WRAPLOW(s0 - s2, 8); + x3 = WRAPLOW(s1 - s3, 8); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -404,19 +418,19 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = dct_const_round_shift(s2); - x3 = dct_const_round_shift(s3); - x6 = dct_const_round_shift(s6); - x7 = dct_const_round_shift(s7); + x2 = WRAPLOW(dct_const_round_shift(s2), 8); + x3 = WRAPLOW(dct_const_round_shift(s3), 8); + x6 = WRAPLOW(dct_const_round_shift(s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s7), 8); - output[0] = x0; - output[1] = -x4; - output[2] = x6; - output[3] = -x2; - output[4] = x3; - output[5] = -x7; - output[6] = x5; - output[7] = -x1; + output[0] = WRAPLOW(x0, 8); + output[1] = WRAPLOW(-x4, 8); + output[2] = WRAPLOW(x6, 8); + output[3] = WRAPLOW(-x2, 8); + output[4] = WRAPLOW(x3, 8); + output[5] = WRAPLOW(-x7, 8); + output[6] = WRAPLOW(x5, 8); + output[7] = WRAPLOW(-x1, 8); } static const transform_2d IHT_8[] = { @@ -446,9 +460,10 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; ht.cols(temp_in, temp_out); - for (j = 0; j < 8; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * stride + i]); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 5), + dest[j * stride + i]); + } } } @@ -471,9 +486,10 @@ void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; idct8(temp_in, temp_out); - for (j = 0; j < 8; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * stride + i]); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 5), + dest[j * stride + i]); + } } } @@ -511,23 +527,23 @@ static void idct16(const tran_low_t *input, tran_low_t *output) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = dct_const_round_shift(temp1); - step2[15] = dct_const_round_shift(temp2); + step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = dct_const_round_shift(temp1); - step2[14] = dct_const_round_shift(temp2); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = dct_const_round_shift(temp1); - step2[13] = dct_const_round_shift(temp2); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = dct_const_round_shift(temp1); - step2[12] = dct_const_round_shift(temp2); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); // stage 3 step1[0] = step2[0]; @@ -537,109 +553,109 @@ static void idct16(const tran_low_t *input, tran_low_t *output) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = dct_const_round_shift(temp1); - step1[7] = dct_const_round_shift(temp2); + step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = dct_const_round_shift(temp1); - step1[6] = dct_const_round_shift(temp2); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[8] = step2[8] + step2[9]; - step1[9] = step2[8] - step2[9]; - step1[10] = -step2[10] + step2[11]; - step1[11] = step2[10] + step2[11]; - step1[12] = step2[12] + step2[13]; - step1[13] = step2[12] - step2[13]; - step1[14] = -step2[14] + step2[15]; - step1[15] = step2[14] + step2[15]; + step1[8] = WRAPLOW(step2[8] + step2[9], 8); + step1[9] = WRAPLOW(step2[8] - step2[9], 8); + step1[10] = WRAPLOW(-step2[10] + step2[11], 8); + step1[11] = WRAPLOW(step2[10] + step2[11], 8); + step1[12] = WRAPLOW(step2[12] + step2[13], 8); + step1[13] = WRAPLOW(step2[12] - step2[13], 8); + step1[14] = WRAPLOW(-step2[14] + step2[15], 8); + step1[15] = WRAPLOW(step2[14] + step2[15], 8); // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = dct_const_round_shift(temp1); - step2[1] = dct_const_round_shift(temp2); + step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = dct_const_round_shift(temp1); - step2[3] = dct_const_round_shift(temp2); - step2[4] = step1[4] + step1[5]; - step2[5] = step1[4] - step1[5]; - step2[6] = -step1[6] + step1[7]; - step2[7] = step1[6] + step1[7]; + step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[4] = WRAPLOW(step1[4] + step1[5], 8); + step2[5] = WRAPLOW(step1[4] - step1[5], 8); + step2[6] = WRAPLOW(-step1[6] + step1[7], 8); + step2[7] = WRAPLOW(step1[6] + step1[7], 8); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = dct_const_round_shift(temp1); - step2[14] = dct_const_round_shift(temp2); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = dct_const_round_shift(temp1); - step2[13] = dct_const_round_shift(temp2); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); step2[11] = step1[11]; step2[12] = step1[12]; // stage 5 - step1[0] = step2[0] + step2[3]; - step1[1] = step2[1] + step2[2]; - step1[2] = step2[1] - step2[2]; - step1[3] = step2[0] - step2[3]; + step1[0] = WRAPLOW(step2[0] + step2[3], 8); + step1[1] = WRAPLOW(step2[1] + step2[2], 8); + step1[2] = WRAPLOW(step2[1] - step2[2], 8); + step1[3] = WRAPLOW(step2[0] - step2[3], 8); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = dct_const_round_shift(temp1); - step1[6] = dct_const_round_shift(temp2); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); step1[7] = step2[7]; - step1[8] = step2[8] + step2[11]; - step1[9] = step2[9] + step2[10]; - step1[10] = step2[9] - step2[10]; - step1[11] = step2[8] - step2[11]; - step1[12] = -step2[12] + step2[15]; - step1[13] = -step2[13] + step2[14]; - step1[14] = step2[13] + step2[14]; - step1[15] = step2[12] + step2[15]; + step1[8] = WRAPLOW(step2[8] + step2[11], 8); + step1[9] = WRAPLOW(step2[9] + step2[10], 8); + step1[10] = WRAPLOW(step2[9] - step2[10], 8); + step1[11] = WRAPLOW(step2[8] - step2[11], 8); + step1[12] = WRAPLOW(-step2[12] + step2[15], 8); + step1[13] = WRAPLOW(-step2[13] + step2[14], 8); + step1[14] = WRAPLOW(step2[13] + step2[14], 8); + step1[15] = WRAPLOW(step2[12] + step2[15], 8); // stage 6 - step2[0] = step1[0] + step1[7]; - step2[1] = step1[1] + step1[6]; - step2[2] = step1[2] + step1[5]; - step2[3] = step1[3] + step1[4]; - step2[4] = step1[3] - step1[4]; - step2[5] = step1[2] - step1[5]; - step2[6] = step1[1] - step1[6]; - step2[7] = step1[0] - step1[7]; + step2[0] = WRAPLOW(step1[0] + step1[7], 8); + step2[1] = WRAPLOW(step1[1] + step1[6], 8); + step2[2] = WRAPLOW(step1[2] + step1[5], 8); + step2[3] = WRAPLOW(step1[3] + step1[4], 8); + step2[4] = WRAPLOW(step1[3] - step1[4], 8); + step2[5] = WRAPLOW(step1[2] - step1[5], 8); + step2[6] = WRAPLOW(step1[1] - step1[6], 8); + step2[7] = WRAPLOW(step1[0] - step1[7], 8); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = dct_const_round_shift(temp1); - step2[13] = dct_const_round_shift(temp2); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = dct_const_round_shift(temp1); - step2[12] = dct_const_round_shift(temp2); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 - output[0] = step2[0] + step2[15]; - output[1] = step2[1] + step2[14]; - output[2] = step2[2] + step2[13]; - output[3] = step2[3] + step2[12]; - output[4] = step2[4] + step2[11]; - output[5] = step2[5] + step2[10]; - output[6] = step2[6] + step2[9]; - output[7] = step2[7] + step2[8]; - output[8] = step2[7] - step2[8]; - output[9] = step2[6] - step2[9]; - output[10] = step2[5] - step2[10]; - output[11] = step2[4] - step2[11]; - output[12] = step2[3] - step2[12]; - output[13] = step2[2] - step2[13]; - output[14] = step2[1] - step2[14]; - output[15] = step2[0] - step2[15]; + output[0] = WRAPLOW(step2[0] + step2[15], 8); + output[1] = WRAPLOW(step2[1] + step2[14], 8); + output[2] = WRAPLOW(step2[2] + step2[13], 8); + output[3] = WRAPLOW(step2[3] + step2[12], 8); + output[4] = WRAPLOW(step2[4] + step2[11], 8); + output[5] = WRAPLOW(step2[5] + step2[10], 8); + output[6] = WRAPLOW(step2[6] + step2[9], 8); + output[7] = WRAPLOW(step2[7] + step2[8], 8); + output[8] = WRAPLOW(step2[7] - step2[8], 8); + output[9] = WRAPLOW(step2[6] - step2[9], 8); + output[10] = WRAPLOW(step2[5] - step2[10], 8); + output[11] = WRAPLOW(step2[4] - step2[11], 8); + output[12] = WRAPLOW(step2[3] - step2[12], 8); + output[13] = WRAPLOW(step2[2] - step2[13], 8); + output[14] = WRAPLOW(step2[1] - step2[14], 8); + output[15] = WRAPLOW(step2[0] - step2[15], 8); } void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, @@ -661,9 +677,10 @@ void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; idct16(temp_in, temp_out); - for (j = 0; j < 16; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 6), + dest[j * stride + i]); + } } } @@ -715,22 +732,22 @@ static void iadst16(const tran_low_t *input, tran_low_t *output) { s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = dct_const_round_shift(s0 + s8); - x1 = dct_const_round_shift(s1 + s9); - x2 = dct_const_round_shift(s2 + s10); - x3 = dct_const_round_shift(s3 + s11); - x4 = dct_const_round_shift(s4 + s12); - x5 = dct_const_round_shift(s5 + s13); - x6 = dct_const_round_shift(s6 + s14); - x7 = dct_const_round_shift(s7 + s15); - x8 = dct_const_round_shift(s0 - s8); - x9 = dct_const_round_shift(s1 - s9); - x10 = dct_const_round_shift(s2 - s10); - x11 = dct_const_round_shift(s3 - s11); - x12 = dct_const_round_shift(s4 - s12); - x13 = dct_const_round_shift(s5 - s13); - x14 = dct_const_round_shift(s6 - s14); - x15 = dct_const_round_shift(s7 - s15); + x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8); // stage 2 s0 = x0; @@ -750,22 +767,22 @@ static void iadst16(const tran_low_t *input, tran_low_t *output) { s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - x0 = s0 + s4; - x1 = s1 + s5; - x2 = s2 + s6; - x3 = s3 + s7; - x4 = s0 - s4; - x5 = s1 - s5; - x6 = s2 - s6; - x7 = s3 - s7; - x8 = dct_const_round_shift(s8 + s12); - x9 = dct_const_round_shift(s9 + s13); - x10 = dct_const_round_shift(s10 + s14); - x11 = dct_const_round_shift(s11 + s15); - x12 = dct_const_round_shift(s8 - s12); - x13 = dct_const_round_shift(s9 - s13); - x14 = dct_const_round_shift(s10 - s14); - x15 = dct_const_round_shift(s11 - s15); + x0 = WRAPLOW(s0 + s4, 8); + x1 = WRAPLOW(s1 + s5, 8); + x2 = WRAPLOW(s2 + s6, 8); + x3 = WRAPLOW(s3 + s7, 8); + x4 = WRAPLOW(s0 - s4, 8); + x5 = WRAPLOW(s1 - s5, 8); + x6 = WRAPLOW(s2 - s6, 8); + x7 = WRAPLOW(s3 - s7, 8); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8); // stage 3 s0 = x0; @@ -785,22 +802,22 @@ static void iadst16(const tran_low_t *input, tran_low_t *output) { s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = dct_const_round_shift(s4 + s6); - x5 = dct_const_round_shift(s5 + s7); - x6 = dct_const_round_shift(s4 - s6); - x7 = dct_const_round_shift(s5 - s7); - x8 = s8 + s10; - x9 = s9 + s11; - x10 = s8 - s10; - x11 = s9 - s11; - x12 = dct_const_round_shift(s12 + s14); - x13 = dct_const_round_shift(s13 + s15); - x14 = dct_const_round_shift(s12 - s14); - x15 = dct_const_round_shift(s13 - s15); + x0 = WRAPLOW(s0 + s2, 8); + x1 = WRAPLOW(s1 + s3, 8); + x2 = WRAPLOW(s0 - s2, 8); + x3 = WRAPLOW(s1 - s3, 8); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); + x8 = WRAPLOW(s8 + s10, 8); + x9 = WRAPLOW(s9 + s11, 8); + x10 = WRAPLOW(s8 - s10, 8); + x11 = WRAPLOW(s9 - s11, 8); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -812,31 +829,31 @@ static void iadst16(const tran_low_t *input, tran_low_t *output) { s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = dct_const_round_shift(s2); - x3 = dct_const_round_shift(s3); - x6 = dct_const_round_shift(s6); - x7 = dct_const_round_shift(s7); - x10 = dct_const_round_shift(s10); - x11 = dct_const_round_shift(s11); - x14 = dct_const_round_shift(s14); - x15 = dct_const_round_shift(s15); + x2 = WRAPLOW(dct_const_round_shift(s2), 8); + x3 = WRAPLOW(dct_const_round_shift(s3), 8); + x6 = WRAPLOW(dct_const_round_shift(s6), 8); + x7 = WRAPLOW(dct_const_round_shift(s7), 8); + x10 = WRAPLOW(dct_const_round_shift(s10), 8); + x11 = WRAPLOW(dct_const_round_shift(s11), 8); + x14 = WRAPLOW(dct_const_round_shift(s14), 8); + x15 = WRAPLOW(dct_const_round_shift(s15), 8); - output[0] = x0; - output[1] = -x8; - output[2] = x12; - output[3] = -x4; - output[4] = x6; - output[5] = x14; - output[6] = x10; - output[7] = x2; - output[8] = x3; - output[9] = x11; - output[10] = x15; - output[11] = x7; - output[12] = x5; - output[13] = -x13; - output[14] = x9; - output[15] = -x1; + output[0] = WRAPLOW(x0, 8); + output[1] = WRAPLOW(-x8, 8); + output[2] = WRAPLOW(x12, 8); + output[3] = WRAPLOW(-x4, 8); + output[4] = WRAPLOW(x6, 8); + output[5] = WRAPLOW(x14, 8); + output[6] = WRAPLOW(x10, 8); + output[7] = WRAPLOW(x2, 8); + output[8] = WRAPLOW(x3, 8); + output[9] = WRAPLOW(x11, 8); + output[10] = WRAPLOW(x15, 8); + output[11] = WRAPLOW(x7, 8); + output[12] = WRAPLOW(x5, 8); + output[13] = WRAPLOW(-x13, 8); + output[14] = WRAPLOW(x9, 8); + output[15] = WRAPLOW(-x1, 8); } static const transform_2d IHT_16[] = { @@ -866,9 +883,10 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; ht.cols(temp_in, temp_out); - for (j = 0; j < 16; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 6), + dest[j * stride + i]); + } } } @@ -892,21 +910,22 @@ void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) temp_in[j] = out[j*16 + i]; idct16(temp_in, temp_out); - for (j = 0; j < 16; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 6), + dest[j * stride + i]); + } } } void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) - dest[i] = clip_pixel(dest[i] + a1); + dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -935,43 +954,43 @@ static void idct32(const tran_low_t *input, tran_low_t *output) { temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = dct_const_round_shift(temp1); - step1[31] = dct_const_round_shift(temp2); + step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = dct_const_round_shift(temp1); - step1[30] = dct_const_round_shift(temp2); + step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = dct_const_round_shift(temp1); - step1[29] = dct_const_round_shift(temp2); + step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = dct_const_round_shift(temp1); - step1[28] = dct_const_round_shift(temp2); + step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = dct_const_round_shift(temp1); - step1[27] = dct_const_round_shift(temp2); + step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = dct_const_round_shift(temp1); - step1[26] = dct_const_round_shift(temp2); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = dct_const_round_shift(temp1); - step1[25] = dct_const_round_shift(temp2); + step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = dct_const_round_shift(temp1); - step1[24] = dct_const_round_shift(temp2); + step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); // stage 2 step2[0] = step1[0]; @@ -985,40 +1004,40 @@ static void idct32(const tran_low_t *input, tran_low_t *output) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = dct_const_round_shift(temp1); - step2[15] = dct_const_round_shift(temp2); + step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = dct_const_round_shift(temp1); - step2[14] = dct_const_round_shift(temp2); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = dct_const_round_shift(temp1); - step2[13] = dct_const_round_shift(temp2); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = dct_const_round_shift(temp1); - step2[12] = dct_const_round_shift(temp2); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[16] = step1[16] + step1[17]; - step2[17] = step1[16] - step1[17]; - step2[18] = -step1[18] + step1[19]; - step2[19] = step1[18] + step1[19]; - step2[20] = step1[20] + step1[21]; - step2[21] = step1[20] - step1[21]; - step2[22] = -step1[22] + step1[23]; - step2[23] = step1[22] + step1[23]; - step2[24] = step1[24] + step1[25]; - step2[25] = step1[24] - step1[25]; - step2[26] = -step1[26] + step1[27]; - step2[27] = step1[26] + step1[27]; - step2[28] = step1[28] + step1[29]; - step2[29] = step1[28] - step1[29]; - step2[30] = -step1[30] + step1[31]; - step2[31] = step1[30] + step1[31]; + step2[16] = WRAPLOW(step1[16] + step1[17], 8); + step2[17] = WRAPLOW(step1[16] - step1[17], 8); + step2[18] = WRAPLOW(-step1[18] + step1[19], 8); + step2[19] = WRAPLOW(step1[18] + step1[19], 8); + step2[20] = WRAPLOW(step1[20] + step1[21], 8); + step2[21] = WRAPLOW(step1[20] - step1[21], 8); + step2[22] = WRAPLOW(-step1[22] + step1[23], 8); + step2[23] = WRAPLOW(step1[22] + step1[23], 8); + step2[24] = WRAPLOW(step1[24] + step1[25], 8); + step2[25] = WRAPLOW(step1[24] - step1[25], 8); + step2[26] = WRAPLOW(-step1[26] + step1[27], 8); + step2[27] = WRAPLOW(step1[26] + step1[27], 8); + step2[28] = WRAPLOW(step1[28] + step1[29], 8); + step2[29] = WRAPLOW(step1[28] - step1[29], 8); + step2[30] = WRAPLOW(-step1[30] + step1[31], 8); + step2[31] = WRAPLOW(step1[30] + step1[31], 8); // stage 3 step1[0] = step2[0]; @@ -1028,42 +1047,42 @@ static void idct32(const tran_low_t *input, tran_low_t *output) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = dct_const_round_shift(temp1); - step1[7] = dct_const_round_shift(temp2); + step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = dct_const_round_shift(temp1); - step1[6] = dct_const_round_shift(temp2); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - step1[8] = step2[8] + step2[9]; - step1[9] = step2[8] - step2[9]; - step1[10] = -step2[10] + step2[11]; - step1[11] = step2[10] + step2[11]; - step1[12] = step2[12] + step2[13]; - step1[13] = step2[12] - step2[13]; - step1[14] = -step2[14] + step2[15]; - step1[15] = step2[14] + step2[15]; + step1[8] = WRAPLOW(step2[8] + step2[9], 8); + step1[9] = WRAPLOW(step2[8] - step2[9], 8); + step1[10] = WRAPLOW(-step2[10] + step2[11], 8); + step1[11] = WRAPLOW(step2[10] + step2[11], 8); + step1[12] = WRAPLOW(step2[12] + step2[13], 8); + step1[13] = WRAPLOW(step2[12] - step2[13], 8); + step1[14] = WRAPLOW(-step2[14] + step2[15], 8); + step1[15] = WRAPLOW(step2[14] + step2[15], 8); step1[16] = step2[16]; step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = dct_const_round_shift(temp1); - step1[30] = dct_const_round_shift(temp2); + step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = dct_const_round_shift(temp1); - step1[29] = dct_const_round_shift(temp2); + step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = dct_const_round_shift(temp1); - step1[26] = dct_const_round_shift(temp2); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = dct_const_round_shift(temp1); - step1[25] = dct_const_round_shift(temp2); + step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -1072,87 +1091,87 @@ static void idct32(const tran_low_t *input, tran_low_t *output) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = dct_const_round_shift(temp1); - step2[1] = dct_const_round_shift(temp2); + step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = dct_const_round_shift(temp1); - step2[3] = dct_const_round_shift(temp2); - step2[4] = step1[4] + step1[5]; - step2[5] = step1[4] - step1[5]; - step2[6] = -step1[6] + step1[7]; - step2[7] = step1[6] + step1[7]; + step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[4] = WRAPLOW(step1[4] + step1[5], 8); + step2[5] = WRAPLOW(step1[4] - step1[5], 8); + step2[6] = WRAPLOW(-step1[6] + step1[7], 8); + step2[7] = WRAPLOW(step1[6] + step1[7], 8); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = dct_const_round_shift(temp1); - step2[14] = dct_const_round_shift(temp2); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = dct_const_round_shift(temp1); - step2[13] = dct_const_round_shift(temp2); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); step2[11] = step1[11]; step2[12] = step1[12]; - step2[16] = step1[16] + step1[19]; - step2[17] = step1[17] + step1[18]; - step2[18] = step1[17] - step1[18]; - step2[19] = step1[16] - step1[19]; - step2[20] = -step1[20] + step1[23]; - step2[21] = -step1[21] + step1[22]; - step2[22] = step1[21] + step1[22]; - step2[23] = step1[20] + step1[23]; + step2[16] = WRAPLOW(step1[16] + step1[19], 8); + step2[17] = WRAPLOW(step1[17] + step1[18], 8); + step2[18] = WRAPLOW(step1[17] - step1[18], 8); + step2[19] = WRAPLOW(step1[16] - step1[19], 8); + step2[20] = WRAPLOW(-step1[20] + step1[23], 8); + step2[21] = WRAPLOW(-step1[21] + step1[22], 8); + step2[22] = WRAPLOW(step1[21] + step1[22], 8); + step2[23] = WRAPLOW(step1[20] + step1[23], 8); - step2[24] = step1[24] + step1[27]; - step2[25] = step1[25] + step1[26]; - step2[26] = step1[25] - step1[26]; - step2[27] = step1[24] - step1[27]; - step2[28] = -step1[28] + step1[31]; - step2[29] = -step1[29] + step1[30]; - step2[30] = step1[29] + step1[30]; - step2[31] = step1[28] + step1[31]; + step2[24] = WRAPLOW(step1[24] + step1[27], 8); + step2[25] = WRAPLOW(step1[25] + step1[26], 8); + step2[26] = WRAPLOW(step1[25] - step1[26], 8); + step2[27] = WRAPLOW(step1[24] - step1[27], 8); + step2[28] = WRAPLOW(-step1[28] + step1[31], 8); + step2[29] = WRAPLOW(-step1[29] + step1[30], 8); + step2[30] = WRAPLOW(step1[29] + step1[30], 8); + step2[31] = WRAPLOW(step1[28] + step1[31], 8); // stage 5 - step1[0] = step2[0] + step2[3]; - step1[1] = step2[1] + step2[2]; - step1[2] = step2[1] - step2[2]; - step1[3] = step2[0] - step2[3]; + step1[0] = WRAPLOW(step2[0] + step2[3], 8); + step1[1] = WRAPLOW(step2[1] + step2[2], 8); + step1[2] = WRAPLOW(step2[1] - step2[2], 8); + step1[3] = WRAPLOW(step2[0] - step2[3], 8); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = dct_const_round_shift(temp1); - step1[6] = dct_const_round_shift(temp2); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); step1[7] = step2[7]; - step1[8] = step2[8] + step2[11]; - step1[9] = step2[9] + step2[10]; - step1[10] = step2[9] - step2[10]; - step1[11] = step2[8] - step2[11]; - step1[12] = -step2[12] + step2[15]; - step1[13] = -step2[13] + step2[14]; - step1[14] = step2[13] + step2[14]; - step1[15] = step2[12] + step2[15]; + step1[8] = WRAPLOW(step2[8] + step2[11], 8); + step1[9] = WRAPLOW(step2[9] + step2[10], 8); + step1[10] = WRAPLOW(step2[9] - step2[10], 8); + step1[11] = WRAPLOW(step2[8] - step2[11], 8); + step1[12] = WRAPLOW(-step2[12] + step2[15], 8); + step1[13] = WRAPLOW(-step2[13] + step2[14], 8); + step1[14] = WRAPLOW(step2[13] + step2[14], 8); + step1[15] = WRAPLOW(step2[12] + step2[15], 8); step1[16] = step2[16]; step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = dct_const_round_shift(temp1); - step1[29] = dct_const_round_shift(temp2); + step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = dct_const_round_shift(temp1); - step1[28] = dct_const_round_shift(temp2); + step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = dct_const_round_shift(temp1); - step1[27] = dct_const_round_shift(temp2); + step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = dct_const_round_shift(temp1); - step1[26] = dct_const_round_shift(temp2); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -1161,62 +1180,62 @@ static void idct32(const tran_low_t *input, tran_low_t *output) { step1[31] = step2[31]; // stage 6 - step2[0] = step1[0] + step1[7]; - step2[1] = step1[1] + step1[6]; - step2[2] = step1[2] + step1[5]; - step2[3] = step1[3] + step1[4]; - step2[4] = step1[3] - step1[4]; - step2[5] = step1[2] - step1[5]; - step2[6] = step1[1] - step1[6]; - step2[7] = step1[0] - step1[7]; + step2[0] = WRAPLOW(step1[0] + step1[7], 8); + step2[1] = WRAPLOW(step1[1] + step1[6], 8); + step2[2] = WRAPLOW(step1[2] + step1[5], 8); + step2[3] = WRAPLOW(step1[3] + step1[4], 8); + step2[4] = WRAPLOW(step1[3] - step1[4], 8); + step2[5] = WRAPLOW(step1[2] - step1[5], 8); + step2[6] = WRAPLOW(step1[1] - step1[6], 8); + step2[7] = WRAPLOW(step1[0] - step1[7], 8); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = dct_const_round_shift(temp1); - step2[13] = dct_const_round_shift(temp2); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = dct_const_round_shift(temp1); - step2[12] = dct_const_round_shift(temp2); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); step2[14] = step1[14]; step2[15] = step1[15]; - step2[16] = step1[16] + step1[23]; - step2[17] = step1[17] + step1[22]; - step2[18] = step1[18] + step1[21]; - step2[19] = step1[19] + step1[20]; - step2[20] = step1[19] - step1[20]; - step2[21] = step1[18] - step1[21]; - step2[22] = step1[17] - step1[22]; - step2[23] = step1[16] - step1[23]; + step2[16] = WRAPLOW(step1[16] + step1[23], 8); + step2[17] = WRAPLOW(step1[17] + step1[22], 8); + step2[18] = WRAPLOW(step1[18] + step1[21], 8); + step2[19] = WRAPLOW(step1[19] + step1[20], 8); + step2[20] = WRAPLOW(step1[19] - step1[20], 8); + step2[21] = WRAPLOW(step1[18] - step1[21], 8); + step2[22] = WRAPLOW(step1[17] - step1[22], 8); + step2[23] = WRAPLOW(step1[16] - step1[23], 8); - step2[24] = -step1[24] + step1[31]; - step2[25] = -step1[25] + step1[30]; - step2[26] = -step1[26] + step1[29]; - step2[27] = -step1[27] + step1[28]; - step2[28] = step1[27] + step1[28]; - step2[29] = step1[26] + step1[29]; - step2[30] = step1[25] + step1[30]; - step2[31] = step1[24] + step1[31]; + step2[24] = WRAPLOW(-step1[24] + step1[31], 8); + step2[25] = WRAPLOW(-step1[25] + step1[30], 8); + step2[26] = WRAPLOW(-step1[26] + step1[29], 8); + step2[27] = WRAPLOW(-step1[27] + step1[28], 8); + step2[28] = WRAPLOW(step1[27] + step1[28], 8); + step2[29] = WRAPLOW(step1[26] + step1[29], 8); + step2[30] = WRAPLOW(step1[25] + step1[30], 8); + step2[31] = WRAPLOW(step1[24] + step1[31], 8); // stage 7 - step1[0] = step2[0] + step2[15]; - step1[1] = step2[1] + step2[14]; - step1[2] = step2[2] + step2[13]; - step1[3] = step2[3] + step2[12]; - step1[4] = step2[4] + step2[11]; - step1[5] = step2[5] + step2[10]; - step1[6] = step2[6] + step2[9]; - step1[7] = step2[7] + step2[8]; - step1[8] = step2[7] - step2[8]; - step1[9] = step2[6] - step2[9]; - step1[10] = step2[5] - step2[10]; - step1[11] = step2[4] - step2[11]; - step1[12] = step2[3] - step2[12]; - step1[13] = step2[2] - step2[13]; - step1[14] = step2[1] - step2[14]; - step1[15] = step2[0] - step2[15]; + step1[0] = WRAPLOW(step2[0] + step2[15], 8); + step1[1] = WRAPLOW(step2[1] + step2[14], 8); + step1[2] = WRAPLOW(step2[2] + step2[13], 8); + step1[3] = WRAPLOW(step2[3] + step2[12], 8); + step1[4] = WRAPLOW(step2[4] + step2[11], 8); + step1[5] = WRAPLOW(step2[5] + step2[10], 8); + step1[6] = WRAPLOW(step2[6] + step2[9], 8); + step1[7] = WRAPLOW(step2[7] + step2[8], 8); + step1[8] = WRAPLOW(step2[7] - step2[8], 8); + step1[9] = WRAPLOW(step2[6] - step2[9], 8); + step1[10] = WRAPLOW(step2[5] - step2[10], 8); + step1[11] = WRAPLOW(step2[4] - step2[11], 8); + step1[12] = WRAPLOW(step2[3] - step2[12], 8); + step1[13] = WRAPLOW(step2[2] - step2[13], 8); + step1[14] = WRAPLOW(step2[1] - step2[14], 8); + step1[15] = WRAPLOW(step2[0] - step2[15], 8); step1[16] = step2[16]; step1[17] = step2[17]; @@ -1224,58 +1243,58 @@ static void idct32(const tran_low_t *input, tran_low_t *output) { step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = dct_const_round_shift(temp1); - step1[27] = dct_const_round_shift(temp2); + step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = dct_const_round_shift(temp1); - step1[26] = dct_const_round_shift(temp2); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = dct_const_round_shift(temp1); - step1[25] = dct_const_round_shift(temp2); + step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = dct_const_round_shift(temp1); - step1[24] = dct_const_round_shift(temp2); + step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); + step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; step1[31] = step2[31]; // final stage - output[0] = step1[0] + step1[31]; - output[1] = step1[1] + step1[30]; - output[2] = step1[2] + step1[29]; - output[3] = step1[3] + step1[28]; - output[4] = step1[4] + step1[27]; - output[5] = step1[5] + step1[26]; - output[6] = step1[6] + step1[25]; - output[7] = step1[7] + step1[24]; - output[8] = step1[8] + step1[23]; - output[9] = step1[9] + step1[22]; - output[10] = step1[10] + step1[21]; - output[11] = step1[11] + step1[20]; - output[12] = step1[12] + step1[19]; - output[13] = step1[13] + step1[18]; - output[14] = step1[14] + step1[17]; - output[15] = step1[15] + step1[16]; - output[16] = step1[15] - step1[16]; - output[17] = step1[14] - step1[17]; - output[18] = step1[13] - step1[18]; - output[19] = step1[12] - step1[19]; - output[20] = step1[11] - step1[20]; - output[21] = step1[10] - step1[21]; - output[22] = step1[9] - step1[22]; - output[23] = step1[8] - step1[23]; - output[24] = step1[7] - step1[24]; - output[25] = step1[6] - step1[25]; - output[26] = step1[5] - step1[26]; - output[27] = step1[4] - step1[27]; - output[28] = step1[3] - step1[28]; - output[29] = step1[2] - step1[29]; - output[30] = step1[1] - step1[30]; - output[31] = step1[0] - step1[31]; + output[0] = WRAPLOW(step1[0] + step1[31], 8); + output[1] = WRAPLOW(step1[1] + step1[30], 8); + output[2] = WRAPLOW(step1[2] + step1[29], 8); + output[3] = WRAPLOW(step1[3] + step1[28], 8); + output[4] = WRAPLOW(step1[4] + step1[27], 8); + output[5] = WRAPLOW(step1[5] + step1[26], 8); + output[6] = WRAPLOW(step1[6] + step1[25], 8); + output[7] = WRAPLOW(step1[7] + step1[24], 8); + output[8] = WRAPLOW(step1[8] + step1[23], 8); + output[9] = WRAPLOW(step1[9] + step1[22], 8); + output[10] = WRAPLOW(step1[10] + step1[21], 8); + output[11] = WRAPLOW(step1[11] + step1[20], 8); + output[12] = WRAPLOW(step1[12] + step1[19], 8); + output[13] = WRAPLOW(step1[13] + step1[18], 8); + output[14] = WRAPLOW(step1[14] + step1[17], 8); + output[15] = WRAPLOW(step1[15] + step1[16], 8); + output[16] = WRAPLOW(step1[15] - step1[16], 8); + output[17] = WRAPLOW(step1[14] - step1[17], 8); + output[18] = WRAPLOW(step1[13] - step1[18], 8); + output[19] = WRAPLOW(step1[12] - step1[19], 8); + output[20] = WRAPLOW(step1[11] - step1[20], 8); + output[21] = WRAPLOW(step1[10] - step1[21], 8); + output[22] = WRAPLOW(step1[9] - step1[22], 8); + output[23] = WRAPLOW(step1[8] - step1[23], 8); + output[24] = WRAPLOW(step1[7] - step1[24], 8); + output[25] = WRAPLOW(step1[6] - step1[25], 8); + output[26] = WRAPLOW(step1[5] - step1[26], 8); + output[27] = WRAPLOW(step1[4] - step1[27], 8); + output[28] = WRAPLOW(step1[3] - step1[28], 8); + output[29] = WRAPLOW(step1[2] - step1[29], 8); + output[30] = WRAPLOW(step1[1] - step1[30], 8); + output[31] = WRAPLOW(step1[0] - step1[31], 8); } void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, @@ -1310,9 +1329,10 @@ void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32(temp_in, temp_out); - for (j = 0; j < 32; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 6), + dest[j * stride + i]); + } } } @@ -1336,9 +1356,10 @@ void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32(temp_in, temp_out); - for (j = 0; j < 32; ++j) - dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(ROUND_POWER_OF_TWO(temp_out[j], 6), + dest[j * stride + i]); + } } } @@ -1346,13 +1367,13 @@ void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { for (i = 0; i < 32; ++i) - dest[i] = clip_pixel(dest[i] + a1); + dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -1468,10 +1489,10 @@ void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, c1 = e1 - c1; a1 -= b1; d1 += c1; - op[0] = WRAPLOW(a1); - op[1] = WRAPLOW(b1); - op[2] = WRAPLOW(c1); - op[3] = WRAPLOW(d1); + op[0] = WRAPLOW(a1, bd); + op[1] = WRAPLOW(b1, bd); + op[2] = WRAPLOW(c1, bd); + op[3] = WRAPLOW(d1, bd); ip += 4; op += 4; } @@ -1489,37 +1510,16 @@ void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, c1 = e1 - c1; a1 -= b1; d1 += c1; - dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd); - dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd); - dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd); - dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd); + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); ip++; dest++; } } -static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - (void) bd; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1)); - step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1)); - step[3] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3]); - output[1] = WRAPLOW(step[1] + step[2]); - output[2] = WRAPLOW(step[1] - step[2]); - output[3] = WRAPLOW(step[0] - step[3]); -} - void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, int dest_stride, int bd) { int i; @@ -1533,22 +1533,47 @@ void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; - op[0] = WRAPLOW(a1); - op[1] = op[2] = op[3] = WRAPLOW(e1); + op[0] = WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = WRAPLOW(e1, bd); ip = tmp; for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd); + dest[dest_stride * 0] = highbd_clip_pixel_add( + dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = highbd_clip_pixel_add( + dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = highbd_clip_pixel_add( + dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = highbd_clip_pixel_add( + dest[dest_stride * 3], e1, bd); ip++; dest++; } } +static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void) bd; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3], bd); + output[1] = WRAPLOW(step[1] + step[2], bd); + output[2] = WRAPLOW(step[1] - step[2], bd); + output[3] = WRAPLOW(step[0] - step[3], bd); +} + void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { tran_low_t out[4 * 4]; @@ -1559,7 +1584,7 @@ void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, // Rows for (i = 0; i < 4; ++i) { - high_idct4(input, outptr, bd); + highbd_idct4(input, outptr, bd); input += 4; outptr += 4; } @@ -1568,10 +1593,11 @@ void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - high_idct4(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + highbd_idct4(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } } } @@ -1579,22 +1605,22 @@ void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int dest_stride, int bd) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { - dest[0] = clip_pixel_bd_high(dest[0], a1, bd); - dest[1] = clip_pixel_bd_high(dest[1], a1, bd); - dest[2] = clip_pixel_bd_high(dest[2], a1, bd); - dest[3] = clip_pixel_bd_high(dest[3], a1, bd); + dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); + dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); + dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); + dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); dest += dest_stride; } } -static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) { +static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; // stage 1 @@ -1604,39 +1630,39 @@ static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 & stage 3 - even half - high_idct4(step1, step1, bd); + highbd_idct4(step1, step1, bd); // stage 2 - odd half - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); + step2[4] = WRAPLOW(step1[4] + step1[5], bd); + step2[5] = WRAPLOW(step1[4] - step1[5], bd); + step2[6] = WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = WRAPLOW(step1[6] + step1[7], bd); // stage 3 - odd half step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7]); - output[1] = WRAPLOW(step1[1] + step1[6]); - output[2] = WRAPLOW(step1[2] + step1[5]); - output[3] = WRAPLOW(step1[3] + step1[4]); - output[4] = WRAPLOW(step1[3] - step1[4]); - output[5] = WRAPLOW(step1[2] - step1[5]); - output[6] = WRAPLOW(step1[1] - step1[6]); - output[7] = WRAPLOW(step1[0] - step1[7]); + output[0] = WRAPLOW(step1[0] + step1[7], bd); + output[1] = WRAPLOW(step1[1] + step1[6], bd); + output[2] = WRAPLOW(step1[2] + step1[5], bd); + output[3] = WRAPLOW(step1[3] + step1[4], bd); + output[4] = WRAPLOW(step1[3] - step1[4], bd); + output[5] = WRAPLOW(step1[2] - step1[5], bd); + output[6] = WRAPLOW(step1[1] - step1[6], bd); + output[7] = WRAPLOW(step1[0] - step1[7], bd); } void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1649,7 +1675,7 @@ void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, // First transform rows. for (i = 0; i < 8; ++i) { - high_idct8(input, outptr, bd); + highbd_idct8(input, outptr, bd); input += 8; outptr += 8; } @@ -1658,11 +1684,11 @@ void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - high_idct8(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) - dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5), - bd); + highbd_idct8(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } } } @@ -1670,18 +1696,18 @@ void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) - dest[i] = clip_pixel_bd_high(dest[i], a1, bd); + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } -static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { +static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; tran_high_t x0 = input[0]; @@ -1718,19 +1744,19 @@ static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0)); - output[1] = WRAPLOW(dct_const_round_shift(s1)); - output[2] = WRAPLOW(dct_const_round_shift(s2)); - output[3] = WRAPLOW(dct_const_round_shift(s3)); + output[0] = WRAPLOW(dct_const_round_shift(s0), bd); + output[1] = WRAPLOW(dct_const_round_shift(s1), bd); + output[2] = WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = WRAPLOW(dct_const_round_shift(s3), bd); } void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int tx_type, int bd) { const high_transform_2d IHT_4[] = { - { high_idct4, high_idct4 }, // DCT_DCT = 0 - { high_iadst4, high_idct4 }, // ADST_DCT = 1 - { high_idct4, high_iadst4 }, // DCT_ADST = 2 - { high_iadst4, high_iadst4 } // ADST_ADST = 3 + { highbd_idct4, highbd_idct4 }, // DCT_DCT = 0 + { highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1 + { highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2 + { highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3 }; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -1751,13 +1777,14 @@ void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; IHT_4[tx_type].cols(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } } } -static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { +static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; tran_high_t x0 = input[7]; @@ -1785,14 +1812,14 @@ static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd); // stage 2 s0 = x0; @@ -1804,14 +1831,14 @@ static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x0 = WRAPLOW(s0 + s2, bd); + x1 = WRAPLOW(s1 + s3, bd); + x2 = WRAPLOW(s0 - s2, bd); + x3 = WRAPLOW(s1 - s3, bd); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -1819,26 +1846,26 @@ static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); + x2 = WRAPLOW(dct_const_round_shift(s2), bd); + x3 = WRAPLOW(dct_const_round_shift(s3), bd); + x6 = WRAPLOW(dct_const_round_shift(s6), bd); + x7 = WRAPLOW(dct_const_round_shift(s7), bd); - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x4); - output[2] = WRAPLOW(x6); - output[3] = WRAPLOW(-x2); - output[4] = WRAPLOW(x3); - output[5] = WRAPLOW(-x7); - output[6] = WRAPLOW(x5); - output[7] = WRAPLOW(-x1); + output[0] = WRAPLOW(x0, bd); + output[1] = WRAPLOW(-x4, bd); + output[2] = WRAPLOW(x6, bd); + output[3] = WRAPLOW(-x2, bd); + output[4] = WRAPLOW(x3, bd); + output[5] = WRAPLOW(-x7, bd); + output[6] = WRAPLOW(x5, bd); + output[7] = WRAPLOW(-x1, bd); } static const high_transform_2d HIGH_IHT_8[] = { - { high_idct8, high_idct8 }, // DCT_DCT = 0 - { high_iadst8, high_idct8 }, // ADST_DCT = 1 - { high_idct8, high_iadst8 }, // DCT_ADST = 2 - { high_iadst8, high_iadst8 } // ADST_ADST = 3 + { highbd_idct8, highbd_idct8 }, // DCT_DCT = 0 + { highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1 + { highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2 + { highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3 }; void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1862,9 +1889,10 @@ void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; ht.cols(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } } } @@ -1879,7 +1907,7 @@ void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, // First transform rows. // Only first 4 row has non-zero coefs. for (i = 0; i < 4; ++i) { - high_idct8(input, outptr, bd); + highbd_idct8(input, outptr, bd); input += 8; outptr += 8; } @@ -1887,14 +1915,15 @@ void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - high_idct8(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + highbd_idct8(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } } } -static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { +static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[16], step2[16]; tran_high_t temp1, temp2; (void) bd; @@ -1929,23 +1958,23 @@ static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); // stage 3 step1[0] = step2[0]; @@ -1955,109 +1984,109 @@ static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); + step1[8] = WRAPLOW(step2[8] + step2[9], bd); + step1[9] = WRAPLOW(step2[8] - step2[9], bd); + step1[10] = WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = WRAPLOW(step2[10] + step2[11], bd); + step1[12] = WRAPLOW(step2[12] + step2[13], bd); + step1[13] = WRAPLOW(step2[12] - step2[13], bd); + step1[14] = WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = WRAPLOW(step2[14] + step2[15], bd); // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); + step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[4] = WRAPLOW(step1[4] + step1[5], bd); + step2[5] = WRAPLOW(step1[4] - step1[5], bd); + step2[6] = WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = WRAPLOW(step1[6] + step1[7], bd); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[0] = WRAPLOW(step2[0] + step2[3], bd); + step1[1] = WRAPLOW(step2[1] + step2[2], bd); + step1[2] = WRAPLOW(step2[1] - step2[2], bd); + step1[3] = WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); + step1[8] = WRAPLOW(step2[8] + step2[11], bd); + step1[9] = WRAPLOW(step2[9] + step2[10], bd); + step1[10] = WRAPLOW(step2[9] - step2[10], bd); + step1[11] = WRAPLOW(step2[8] - step2[11], bd); + step1[12] = WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = WRAPLOW(step2[13] + step2[14], bd); + step1[15] = WRAPLOW(step2[12] + step2[15], bd); // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[0] = WRAPLOW(step1[0] + step1[7], bd); + step2[1] = WRAPLOW(step1[1] + step1[6], bd); + step2[2] = WRAPLOW(step1[2] + step1[5], bd); + step2[3] = WRAPLOW(step1[3] + step1[4], bd); + step2[4] = WRAPLOW(step1[3] - step1[4], bd); + step2[5] = WRAPLOW(step1[2] - step1[5], bd); + step2[6] = WRAPLOW(step1[1] - step1[6], bd); + step2[7] = WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15]); - output[1] = WRAPLOW(step2[1] + step2[14]); - output[2] = WRAPLOW(step2[2] + step2[13]); - output[3] = WRAPLOW(step2[3] + step2[12]); - output[4] = WRAPLOW(step2[4] + step2[11]); - output[5] = WRAPLOW(step2[5] + step2[10]); - output[6] = WRAPLOW(step2[6] + step2[9]); - output[7] = WRAPLOW(step2[7] + step2[8]); - output[8] = WRAPLOW(step2[7] - step2[8]); - output[9] = WRAPLOW(step2[6] - step2[9]); - output[10] = WRAPLOW(step2[5] - step2[10]); - output[11] = WRAPLOW(step2[4] - step2[11]); - output[12] = WRAPLOW(step2[3] - step2[12]); - output[13] = WRAPLOW(step2[2] - step2[13]); - output[14] = WRAPLOW(step2[1] - step2[14]); - output[15] = WRAPLOW(step2[0] - step2[15]); + output[0] = WRAPLOW(step2[0] + step2[15], bd); + output[1] = WRAPLOW(step2[1] + step2[14], bd); + output[2] = WRAPLOW(step2[2] + step2[13], bd); + output[3] = WRAPLOW(step2[3] + step2[12], bd); + output[4] = WRAPLOW(step2[4] + step2[11], bd); + output[5] = WRAPLOW(step2[5] + step2[10], bd); + output[6] = WRAPLOW(step2[6] + step2[9], bd); + output[7] = WRAPLOW(step2[7] + step2[8], bd); + output[8] = WRAPLOW(step2[7] - step2[8], bd); + output[9] = WRAPLOW(step2[6] - step2[9], bd); + output[10] = WRAPLOW(step2[5] - step2[10], bd); + output[11] = WRAPLOW(step2[4] - step2[11], bd); + output[12] = WRAPLOW(step2[3] - step2[12], bd); + output[13] = WRAPLOW(step2[2] - step2[13], bd); + output[14] = WRAPLOW(step2[1] - step2[14], bd); + output[15] = WRAPLOW(step2[0] - step2[15], bd); } void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2070,7 +2099,7 @@ void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, // First transform rows. for (i = 0; i < 16; ++i) { - high_idct16(input, outptr, bd); + highbd_idct16(input, outptr, bd); input += 16; outptr += 16; } @@ -2079,14 +2108,16 @@ void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - high_idct16(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + highbd_idct16(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } } } -static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { +static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, + int bd) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; tran_high_t s9, s10, s11, s12, s13, s14, s15; @@ -2132,22 +2163,22 @@ static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; @@ -2167,22 +2198,22 @@ static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - x0 = WRAPLOW(s0 + s4); - x1 = WRAPLOW(s1 + s5); - x2 = WRAPLOW(s2 + s6); - x3 = WRAPLOW(s3 + s7); - x4 = WRAPLOW(s0 - s4); - x5 = WRAPLOW(s1 - s5); - x6 = WRAPLOW(s2 - s6); - x7 = WRAPLOW(s3 - s7); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); + x0 = WRAPLOW(s0 + s4, bd); + x1 = WRAPLOW(s1 + s5, bd); + x2 = WRAPLOW(s2 + s6, bd); + x3 = WRAPLOW(s3 + s7, bd); + x4 = WRAPLOW(s0 - s4, bd); + x5 = WRAPLOW(s1 - s5, bd); + x6 = WRAPLOW(s2 - s6, bd); + x7 = WRAPLOW(s3 - s7, bd); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd); // stage 3 s0 = x0; @@ -2202,22 +2233,22 @@ static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - x8 = WRAPLOW(s8 + s10); - x9 = WRAPLOW(s9 + s11); - x10 = WRAPLOW(s8 - s10); - x11 = WRAPLOW(s9 - s11); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + x0 = WRAPLOW(s0 + s2, bd); + x1 = WRAPLOW(s1 + s3, bd); + x2 = WRAPLOW(s0 - s2, bd); + x3 = WRAPLOW(s1 - s3, bd); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x8 = WRAPLOW(s8 + s10, bd); + x9 = WRAPLOW(s9 + s11, bd); + x10 = WRAPLOW(s8 - s10, bd); + x11 = WRAPLOW(s9 - s11, bd); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -2229,38 +2260,38 @@ static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - x10 = WRAPLOW(dct_const_round_shift(s10)); - x11 = WRAPLOW(dct_const_round_shift(s11)); - x14 = WRAPLOW(dct_const_round_shift(s14)); - x15 = WRAPLOW(dct_const_round_shift(s15)); + x2 = WRAPLOW(dct_const_round_shift(s2), bd); + x3 = WRAPLOW(dct_const_round_shift(s3), bd); + x6 = WRAPLOW(dct_const_round_shift(s6), bd); + x7 = WRAPLOW(dct_const_round_shift(s7), bd); + x10 = WRAPLOW(dct_const_round_shift(s10), bd); + x11 = WRAPLOW(dct_const_round_shift(s11), bd); + x14 = WRAPLOW(dct_const_round_shift(s14), bd); + x15 = WRAPLOW(dct_const_round_shift(s15), bd); - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x8); - output[2] = WRAPLOW(x12); - output[3] = WRAPLOW(-x4); - output[4] = WRAPLOW(x6); - output[5] = WRAPLOW(x14); - output[6] = WRAPLOW(x10); - output[7] = WRAPLOW(x2); - output[8] = WRAPLOW(x3); - output[9] = WRAPLOW(x11); - output[10] = WRAPLOW(x15); - output[11] = WRAPLOW(x7); - output[12] = WRAPLOW(x5); - output[13] = WRAPLOW(-x13); - output[14] = WRAPLOW(x9); - output[15] = WRAPLOW(-x1); + output[0] = WRAPLOW(x0, bd); + output[1] = WRAPLOW(-x8, bd); + output[2] = WRAPLOW(x12, bd); + output[3] = WRAPLOW(-x4, bd); + output[4] = WRAPLOW(x6, bd); + output[5] = WRAPLOW(x14, bd); + output[6] = WRAPLOW(x10, bd); + output[7] = WRAPLOW(x2, bd); + output[8] = WRAPLOW(x3, bd); + output[9] = WRAPLOW(x11, bd); + output[10] = WRAPLOW(x15, bd); + output[11] = WRAPLOW(x7, bd); + output[12] = WRAPLOW(x5, bd); + output[13] = WRAPLOW(-x13, bd); + output[14] = WRAPLOW(x9, bd); + output[15] = WRAPLOW(-x1, bd); } static const high_transform_2d HIGH_IHT_16[] = { - { high_idct16, high_idct16 }, // DCT_DCT = 0 - { high_iadst16, high_idct16 }, // ADST_DCT = 1 - { high_idct16, high_iadst16 }, // DCT_ADST = 2 - { high_iadst16, high_iadst16 } // ADST_ADST = 3 + { highbd_idct16, highbd_idct16 }, // DCT_DCT = 0 + { highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1 + { highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2 + { highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3 }; void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2284,9 +2315,10 @@ void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; ht.cols(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } } } @@ -2301,7 +2333,7 @@ void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) { - high_idct16(input, outptr, bd); + highbd_idct16(input, outptr, bd); input += 16; outptr += 16; } @@ -2310,10 +2342,11 @@ void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j*16 + i]; - high_idct16(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + highbd_idct16(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } } } @@ -2321,19 +2354,19 @@ void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) - dest[i] = clip_pixel_bd_high(dest[i], a1, bd); + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } -static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { +static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[32], step2[32]; tran_high_t temp1, temp2; (void) bd; @@ -2358,43 +2391,43 @@ static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1)); - step1[31] = WRAPLOW(dct_const_round_shift(temp2)); + step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 step2[0] = step1[0]; @@ -2408,40 +2441,40 @@ static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); - step2[16] = WRAPLOW(step1[16] + step1[17]); - step2[17] = WRAPLOW(step1[16] - step1[17]); - step2[18] = WRAPLOW(-step1[18] + step1[19]); - step2[19] = WRAPLOW(step1[18] + step1[19]); - step2[20] = WRAPLOW(step1[20] + step1[21]); - step2[21] = WRAPLOW(step1[20] - step1[21]); - step2[22] = WRAPLOW(-step1[22] + step1[23]); - step2[23] = WRAPLOW(step1[22] + step1[23]); - step2[24] = WRAPLOW(step1[24] + step1[25]); - step2[25] = WRAPLOW(step1[24] - step1[25]); - step2[26] = WRAPLOW(-step1[26] + step1[27]); - step2[27] = WRAPLOW(step1[26] + step1[27]); - step2[28] = WRAPLOW(step1[28] + step1[29]); - step2[29] = WRAPLOW(step1[28] - step1[29]); - step2[30] = WRAPLOW(-step1[30] + step1[31]); - step2[31] = WRAPLOW(step1[30] + step1[31]); + step2[16] = WRAPLOW(step1[16] + step1[17], bd); + step2[17] = WRAPLOW(step1[16] - step1[17], bd); + step2[18] = WRAPLOW(-step1[18] + step1[19], bd); + step2[19] = WRAPLOW(step1[18] + step1[19], bd); + step2[20] = WRAPLOW(step1[20] + step1[21], bd); + step2[21] = WRAPLOW(step1[20] - step1[21], bd); + step2[22] = WRAPLOW(-step1[22] + step1[23], bd); + step2[23] = WRAPLOW(step1[22] + step1[23], bd); + step2[24] = WRAPLOW(step1[24] + step1[25], bd); + step2[25] = WRAPLOW(step1[24] - step1[25], bd); + step2[26] = WRAPLOW(-step1[26] + step1[27], bd); + step2[27] = WRAPLOW(step1[26] + step1[27], bd); + step2[28] = WRAPLOW(step1[28] + step1[29], bd); + step2[29] = WRAPLOW(step1[28] - step1[29], bd); + step2[30] = WRAPLOW(-step1[30] + step1[31], bd); + step2[31] = WRAPLOW(step1[30] + step1[31], bd); // stage 3 step1[0] = step2[0]; @@ -2451,42 +2484,42 @@ static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); + step1[8] = WRAPLOW(step2[8] + step2[9], bd); + step1[9] = WRAPLOW(step2[8] - step2[9], bd); + step1[10] = WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = WRAPLOW(step2[10] + step2[11], bd); + step1[12] = WRAPLOW(step2[12] + step2[13], bd); + step1[13] = WRAPLOW(step2[12] - step2[13], bd); + step1[14] = WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = WRAPLOW(step2[14] + step2[15], bd); step1[16] = step2[16]; step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -2495,87 +2528,87 @@ static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); + step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[4] = WRAPLOW(step1[4] + step1[5], bd); + step2[5] = WRAPLOW(step1[4] - step1[5], bd); + step2[6] = WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = WRAPLOW(step1[6] + step1[7], bd); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; - step2[16] = WRAPLOW(step1[16] + step1[19]); - step2[17] = WRAPLOW(step1[17] + step1[18]); - step2[18] = WRAPLOW(step1[17] - step1[18]); - step2[19] = WRAPLOW(step1[16] - step1[19]); - step2[20] = WRAPLOW(-step1[20] + step1[23]); - step2[21] = WRAPLOW(-step1[21] + step1[22]); - step2[22] = WRAPLOW(step1[21] + step1[22]); - step2[23] = WRAPLOW(step1[20] + step1[23]); + step2[16] = WRAPLOW(step1[16] + step1[19], bd); + step2[17] = WRAPLOW(step1[17] + step1[18], bd); + step2[18] = WRAPLOW(step1[17] - step1[18], bd); + step2[19] = WRAPLOW(step1[16] - step1[19], bd); + step2[20] = WRAPLOW(-step1[20] + step1[23], bd); + step2[21] = WRAPLOW(-step1[21] + step1[22], bd); + step2[22] = WRAPLOW(step1[21] + step1[22], bd); + step2[23] = WRAPLOW(step1[20] + step1[23], bd); - step2[24] = WRAPLOW(step1[24] + step1[27]); - step2[25] = WRAPLOW(step1[25] + step1[26]); - step2[26] = WRAPLOW(step1[25] - step1[26]); - step2[27] = WRAPLOW(step1[24] - step1[27]); - step2[28] = WRAPLOW(-step1[28] + step1[31]); - step2[29] = WRAPLOW(-step1[29] + step1[30]); - step2[30] = WRAPLOW(step1[29] + step1[30]); - step2[31] = WRAPLOW(step1[28] + step1[31]); + step2[24] = WRAPLOW(step1[24] + step1[27], bd); + step2[25] = WRAPLOW(step1[25] + step1[26], bd); + step2[26] = WRAPLOW(step1[25] - step1[26], bd); + step2[27] = WRAPLOW(step1[24] - step1[27], bd); + step2[28] = WRAPLOW(-step1[28] + step1[31], bd); + step2[29] = WRAPLOW(-step1[29] + step1[30], bd); + step2[30] = WRAPLOW(step1[29] + step1[30], bd); + step2[31] = WRAPLOW(step1[28] + step1[31], bd); // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[0] = WRAPLOW(step2[0] + step2[3], bd); + step1[1] = WRAPLOW(step2[1] + step2[2], bd); + step1[2] = WRAPLOW(step2[1] - step2[2], bd); + step1[3] = WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); + step1[8] = WRAPLOW(step2[8] + step2[11], bd); + step1[9] = WRAPLOW(step2[9] + step2[10], bd); + step1[10] = WRAPLOW(step2[9] - step2[10], bd); + step1[11] = WRAPLOW(step2[8] - step2[11], bd); + step1[12] = WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = WRAPLOW(step2[13] + step2[14], bd); + step1[15] = WRAPLOW(step2[12] + step2[15], bd); step1[16] = step2[16]; step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2584,62 +2617,62 @@ static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[31] = step2[31]; // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[0] = WRAPLOW(step1[0] + step1[7], bd); + step2[1] = WRAPLOW(step1[1] + step1[6], bd); + step2[2] = WRAPLOW(step1[2] + step1[5], bd); + step2[3] = WRAPLOW(step1[3] + step1[4], bd); + step2[4] = WRAPLOW(step1[3] - step1[4], bd); + step2[5] = WRAPLOW(step1[2] - step1[5], bd); + step2[6] = WRAPLOW(step1[1] - step1[6], bd); + step2[7] = WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = WRAPLOW(step1[14]); - step2[15] = WRAPLOW(step1[15]); + step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; - step2[16] = WRAPLOW(step1[16] + step1[23]); - step2[17] = WRAPLOW(step1[17] + step1[22]); - step2[18] = WRAPLOW(step1[18] + step1[21]); - step2[19] = WRAPLOW(step1[19] + step1[20]); - step2[20] = WRAPLOW(step1[19] - step1[20]); - step2[21] = WRAPLOW(step1[18] - step1[21]); - step2[22] = WRAPLOW(step1[17] - step1[22]); - step2[23] = WRAPLOW(step1[16] - step1[23]); + step2[16] = WRAPLOW(step1[16] + step1[23], bd); + step2[17] = WRAPLOW(step1[17] + step1[22], bd); + step2[18] = WRAPLOW(step1[18] + step1[21], bd); + step2[19] = WRAPLOW(step1[19] + step1[20], bd); + step2[20] = WRAPLOW(step1[19] - step1[20], bd); + step2[21] = WRAPLOW(step1[18] - step1[21], bd); + step2[22] = WRAPLOW(step1[17] - step1[22], bd); + step2[23] = WRAPLOW(step1[16] - step1[23], bd); - step2[24] = WRAPLOW(-step1[24] + step1[31]); - step2[25] = WRAPLOW(-step1[25] + step1[30]); - step2[26] = WRAPLOW(-step1[26] + step1[29]); - step2[27] = WRAPLOW(-step1[27] + step1[28]); - step2[28] = WRAPLOW(step1[27] + step1[28]); - step2[29] = WRAPLOW(step1[26] + step1[29]); - step2[30] = WRAPLOW(step1[25] + step1[30]); - step2[31] = WRAPLOW(step1[24] + step1[31]); + step2[24] = WRAPLOW(-step1[24] + step1[31], bd); + step2[25] = WRAPLOW(-step1[25] + step1[30], bd); + step2[26] = WRAPLOW(-step1[26] + step1[29], bd); + step2[27] = WRAPLOW(-step1[27] + step1[28], bd); + step2[28] = WRAPLOW(step1[27] + step1[28], bd); + step2[29] = WRAPLOW(step1[26] + step1[29], bd); + step2[30] = WRAPLOW(step1[25] + step1[30], bd); + step2[31] = WRAPLOW(step1[24] + step1[31], bd); // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15]); - step1[1] = WRAPLOW(step2[1] + step2[14]); - step1[2] = WRAPLOW(step2[2] + step2[13]); - step1[3] = WRAPLOW(step2[3] + step2[12]); - step1[4] = WRAPLOW(step2[4] + step2[11]); - step1[5] = WRAPLOW(step2[5] + step2[10]); - step1[6] = WRAPLOW(step2[6] + step2[9]); - step1[7] = WRAPLOW(step2[7] + step2[8]); - step1[8] = WRAPLOW(step2[7] - step2[8]); - step1[9] = WRAPLOW(step2[6] - step2[9]); - step1[10] = WRAPLOW(step2[5] - step2[10]); - step1[11] = WRAPLOW(step2[4] - step2[11]); - step1[12] = WRAPLOW(step2[3] - step2[12]); - step1[13] = WRAPLOW(step2[2] - step2[13]); - step1[14] = WRAPLOW(step2[1] - step2[14]); - step1[15] = WRAPLOW(step2[0] - step2[15]); + step1[0] = WRAPLOW(step2[0] + step2[15], bd); + step1[1] = WRAPLOW(step2[1] + step2[14], bd); + step1[2] = WRAPLOW(step2[2] + step2[13], bd); + step1[3] = WRAPLOW(step2[3] + step2[12], bd); + step1[4] = WRAPLOW(step2[4] + step2[11], bd); + step1[5] = WRAPLOW(step2[5] + step2[10], bd); + step1[6] = WRAPLOW(step2[6] + step2[9], bd); + step1[7] = WRAPLOW(step2[7] + step2[8], bd); + step1[8] = WRAPLOW(step2[7] - step2[8], bd); + step1[9] = WRAPLOW(step2[6] - step2[9], bd); + step1[10] = WRAPLOW(step2[5] - step2[10], bd); + step1[11] = WRAPLOW(step2[4] - step2[11], bd); + step1[12] = WRAPLOW(step2[3] - step2[12], bd); + step1[13] = WRAPLOW(step2[2] - step2[13], bd); + step1[14] = WRAPLOW(step2[1] - step2[14], bd); + step1[15] = WRAPLOW(step2[0] - step2[15], bd); step1[16] = step2[16]; step1[17] = step2[17]; @@ -2647,58 +2680,58 @@ static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; step1[31] = step2[31]; // final stage - output[0] = WRAPLOW(step1[0] + step1[31]); - output[1] = WRAPLOW(step1[1] + step1[30]); - output[2] = WRAPLOW(step1[2] + step1[29]); - output[3] = WRAPLOW(step1[3] + step1[28]); - output[4] = WRAPLOW(step1[4] + step1[27]); - output[5] = WRAPLOW(step1[5] + step1[26]); - output[6] = WRAPLOW(step1[6] + step1[25]); - output[7] = WRAPLOW(step1[7] + step1[24]); - output[8] = WRAPLOW(step1[8] + step1[23]); - output[9] = WRAPLOW(step1[9] + step1[22]); - output[10] = WRAPLOW(step1[10] + step1[21]); - output[11] = WRAPLOW(step1[11] + step1[20]); - output[12] = WRAPLOW(step1[12] + step1[19]); - output[13] = WRAPLOW(step1[13] + step1[18]); - output[14] = WRAPLOW(step1[14] + step1[17]); - output[15] = WRAPLOW(step1[15] + step1[16]); - output[16] = WRAPLOW(step1[15] - step1[16]); - output[17] = WRAPLOW(step1[14] - step1[17]); - output[18] = WRAPLOW(step1[13] - step1[18]); - output[19] = WRAPLOW(step1[12] - step1[19]); - output[20] = WRAPLOW(step1[11] - step1[20]); - output[21] = WRAPLOW(step1[10] - step1[21]); - output[22] = WRAPLOW(step1[9] - step1[22]); - output[23] = WRAPLOW(step1[8] - step1[23]); - output[24] = WRAPLOW(step1[7] - step1[24]); - output[25] = WRAPLOW(step1[6] - step1[25]); - output[26] = WRAPLOW(step1[5] - step1[26]); - output[27] = WRAPLOW(step1[4] - step1[27]); - output[28] = WRAPLOW(step1[3] - step1[28]); - output[29] = WRAPLOW(step1[2] - step1[29]); - output[30] = WRAPLOW(step1[1] - step1[30]); - output[31] = WRAPLOW(step1[0] - step1[31]); + output[0] = WRAPLOW(step1[0] + step1[31], bd); + output[1] = WRAPLOW(step1[1] + step1[30], bd); + output[2] = WRAPLOW(step1[2] + step1[29], bd); + output[3] = WRAPLOW(step1[3] + step1[28], bd); + output[4] = WRAPLOW(step1[4] + step1[27], bd); + output[5] = WRAPLOW(step1[5] + step1[26], bd); + output[6] = WRAPLOW(step1[6] + step1[25], bd); + output[7] = WRAPLOW(step1[7] + step1[24], bd); + output[8] = WRAPLOW(step1[8] + step1[23], bd); + output[9] = WRAPLOW(step1[9] + step1[22], bd); + output[10] = WRAPLOW(step1[10] + step1[21], bd); + output[11] = WRAPLOW(step1[11] + step1[20], bd); + output[12] = WRAPLOW(step1[12] + step1[19], bd); + output[13] = WRAPLOW(step1[13] + step1[18], bd); + output[14] = WRAPLOW(step1[14] + step1[17], bd); + output[15] = WRAPLOW(step1[15] + step1[16], bd); + output[16] = WRAPLOW(step1[15] - step1[16], bd); + output[17] = WRAPLOW(step1[14] - step1[17], bd); + output[18] = WRAPLOW(step1[13] - step1[18], bd); + output[19] = WRAPLOW(step1[12] - step1[19], bd); + output[20] = WRAPLOW(step1[11] - step1[20], bd); + output[21] = WRAPLOW(step1[10] - step1[21], bd); + output[22] = WRAPLOW(step1[9] - step1[22], bd); + output[23] = WRAPLOW(step1[8] - step1[23], bd); + output[24] = WRAPLOW(step1[7] - step1[24], bd); + output[25] = WRAPLOW(step1[6] - step1[25], bd); + output[26] = WRAPLOW(step1[5] - step1[26], bd); + output[27] = WRAPLOW(step1[4] - step1[27], bd); + output[28] = WRAPLOW(step1[3] - step1[28], bd); + output[29] = WRAPLOW(step1[2] - step1[29], bd); + output[30] = WRAPLOW(step1[1] - step1[30], bd); + output[31] = WRAPLOW(step1[0] - step1[31], bd); } void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2722,7 +2755,7 @@ void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; if (zero_coeff[0] | zero_coeff[1]) - high_idct32(input, outptr, bd); + highbd_idct32(input, outptr, bd); else vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); input += 32; @@ -2733,10 +2766,11 @@ void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - high_idct32(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + highbd_idct32(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } } } @@ -2751,7 +2785,7 @@ void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, // Rows // Only upper-left 8x8 has non-zero coeff. for (i = 0; i < 8; ++i) { - high_idct32(input, outptr, bd); + highbd_idct32(input, outptr, bd); input += 32; outptr += 32; } @@ -2759,10 +2793,11 @@ void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - high_idct32(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) - dest[j * stride + i] = clip_pixel_bd_high( + highbd_idct32(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } } } @@ -2772,13 +2807,13 @@ void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { for (i = 0; i < 32; ++i) - dest[i] = clip_pixel_bd_high(dest[i], a1, bd); + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 7ebd2ea87..84d9d34b7 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -251,7 +251,7 @@ static INLINE void high_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, for (r = 0; r < bs; r++) { for (c = 0; c < bs; c++) - dst[c] = clip_pixel_high(left[r] + above[c] - ytop_left, bd); + dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); dst += stride; } } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 9451a1c8f..1d9a20670 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -331,6 +331,8 @@ $vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon; # dct # if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + # Note as optimized versions of these functions are added we need to add a check to ensure + # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vp9_idct4x4_1_add/; @@ -380,69 +382,123 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vp9_iwht4x4_16_add/; + } else { - add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/; - $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon; + # Force C versions if CONFIG_EMULATE_HARDWARE is 1 + if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { + add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_1_add/; - add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/; - $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon; + add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_16_add/; - add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/; - $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon; + add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_1_add/; - add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; - $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon; + add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_64_add/; - add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; - $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon; + add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_12_add/; - add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; - $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; + add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_1_add/; - add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; - $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; + add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_256_add/; - add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/; - $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; + add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_10_add/; - add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/; - $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon; + add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1024_add/; - add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/; - $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; + add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_34_add/; - add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/; - $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon; + add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1_add/; - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; - $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht4x4_16_add/; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; - $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht8x8_64_add/; - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp9_iht16x16_256_add sse2 dspr2/; + add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp9_iht16x16_256_add/; - # dct and add + # dct and add - add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_1_add/; + add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_1_add/; - add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_16_add/; + add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_16_add/; + } else { + add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/; + $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon; + + add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/; + $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon; + + add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/; + $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon; + + add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; + $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon; + + add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; + $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon; + + add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; + $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; + + add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; + $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; + + add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/; + $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; + + add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/; + $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon; + + add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/; + $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; + + add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/; + $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon; + + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; + $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; + + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; + $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon; + + add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp9_iht16x16_256_add sse2 dspr2/; + + # dct and add + + add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_1_add/; + + add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vp9_iwht4x4_16_add/; + } } # High bitdepth functions @@ -689,6 +745,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # dct # + # Note as optimized versions of these functions are added we need to add a check to ensure + # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vp9_high_idct4x4_1_add/; diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index 4a8a52156..3d361d4f2 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -571,7 +571,7 @@ static void highbd_interpolate(const uint16_t *const input, int inlength, sum += filter[k] * input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))]; } - *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } } else { // Initial part. @@ -585,7 +585,7 @@ static void highbd_interpolate(const uint16_t *const input, int inlength, sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ? 0 : int_pel - INTERP_TAPS / 2 + 1 + k)]; - *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // Middle part. for (; x <= x2; ++x, y += delta) { @@ -596,7 +596,7 @@ static void highbd_interpolate(const uint16_t *const input, int inlength, sum = 0; for (k = 0; k < INTERP_TAPS; ++k) sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k]; - *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // End part. for (; x < outlength; ++x, y += delta) { @@ -609,7 +609,7 @@ static void highbd_interpolate(const uint16_t *const input, int inlength, sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= inlength ? inlength - 1 : int_pel - INTERP_TAPS / 2 + 1 + k)]; - *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } } } @@ -635,7 +635,7 @@ static void highbd_down2_symeven(const uint16_t *const input, int length, filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } } else { // Initial part. @@ -645,7 +645,7 @@ static void highbd_down2_symeven(const uint16_t *const input, int length, sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } // Middle part. for (; i < l2; i += 2) { @@ -654,7 +654,7 @@ static void highbd_down2_symeven(const uint16_t *const input, int length, sum += (input[i - j] + input[i + 1 + j]) * filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } // End part. for (; i < length; i += 2) { @@ -665,7 +665,7 @@ static void highbd_down2_symeven(const uint16_t *const input, int length, filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } } } @@ -691,7 +691,7 @@ static void highbd_down2_symodd(const uint16_t *const input, int length, filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } } else { // Initial part. @@ -701,7 +701,7 @@ static void highbd_down2_symodd(const uint16_t *const input, int length, sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } // Middle part. for (; i < l2; i += 2) { @@ -710,7 +710,7 @@ static void highbd_down2_symodd(const uint16_t *const input, int length, sum += (input[i - j] + input[i + j]) * filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } // End part. for (; i < length; i += 2) { @@ -720,7 +720,7 @@ static void highbd_down2_symodd(const uint16_t *const input, int length, filter[j]; } sum >>= FILTER_BITS; - *optr++ = clip_pixel_high(sum, bd); + *optr++ = clip_pixel_highbd(sum, bd); } } }