From d5de63d2be0e8547358343c6e123eeb43f3433d9 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 3 May 2017 13:32:08 -0700 Subject: [PATCH] Update highbd idct functions arguments to use uint16_t dst BUG=webm:1388 Change-Id: I3581d80d0389b99166e70987d38aba2db6c469d5 --- test/dct16x16_test.cc | 20 ++++----- test/dct32x32_test.cc | 4 +- test/fdct4x4_test.cc | 16 +++---- test/fdct8x8_test.cc | 20 ++++----- test/partial_idct_test.cc | 6 ++- vp9/common/vp9_idct.c | 25 +++++------ vp9/common/vp9_idct.h | 16 +++---- vp9/common/vp9_rtcd_defs.pl | 6 +-- vp9/decoder/vp9_decodeframe.c | 4 +- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_encodemb.c | 8 ++-- vp9/encoder/vp9_rdopt.c | 13 +++--- vpx_dsp/arm/highbd_idct16x16_add_neon.c | 15 ++----- vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c | 9 ++-- vpx_dsp/arm/highbd_idct32x32_135_add_neon.c | 3 +- vpx_dsp/arm/highbd_idct32x32_34_add_neon.c | 3 +- vpx_dsp/arm/highbd_idct32x32_add_neon.c | 3 +- vpx_dsp/arm/highbd_idct4x4_add_neon.c | 6 +-- vpx_dsp/arm/highbd_idct8x8_add_neon.c | 9 ++-- vpx_dsp/inv_txfm.c | 45 +++++++------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 30 ++++++------- vpx_dsp/x86/inv_txfm_sse2.c | 18 +++----- 22 files changed, 121 insertions(+), 160 deletions(-) diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 4de36c2c6..6ea77fde2 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -255,11 +255,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, #if CONFIG_VP9_HIGHBITDEPTH void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride, @@ -273,36 +273,36 @@ void 
idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride, } void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index b0d5c4d06..d8054c4eb 100644 --- a/test/dct32x32_test.cc +++ 
b/test/dct32x32_test.cc @@ -71,11 +71,11 @@ typedef std::tr1::tuple #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 4836a0453..bd2327520 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -55,36 +55,36 @@ void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, #if CONFIG_VP9_HIGHBITDEPTH void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 10); + vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 12); + vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10); + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12); + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } #if HAVE_SSE2 
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10); + vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12); + vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 9727da93d..06c23c520 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -88,45 +88,45 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { #if CONFIG_VP9_HIGHBITDEPTH void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_12_add_10_sse2(const 
tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12); + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 893165fa3..5175c93b9 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -43,9 +43,11 @@ void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { } #if CONFIG_VP9_HIGHBITDEPTH -template <InvTxfmWithBdFunc fn> +typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out, + int stride, int bd); +template <InvTxfmHighbdFunc fn> void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { - fn(in, out, stride, bd); + fn(in, CAST_TO_SHORTPTR(out), stride, bd); } #endif diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index afdae5996..69069042c 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -205,7 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 @@ -213,7 +213,6 @@ void 
vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 }; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); int i, j; tran_low_t out[4 * 4]; @@ -245,14 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = { { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 }; -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // Inverse transform row vectors. for (i = 0; i < 8; ++i) { @@ -279,14 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = { { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 }; -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 16; ++i) { @@ -307,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } // idct -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_idct4x4_16_add(input, dest, stride, bd); @@ -315,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_idct4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void 
vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); @@ -323,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -340,7 +337,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, } } -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to separate different cases. 
@@ -356,7 +353,7 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, } } -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // Non-zero coeff only in upper-left 8x8 if (eob == 1) { @@ -372,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, // iht void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); else @@ -380,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); } else { @@ -389,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); } else { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index ea958a38c..3e83b8402 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, 
int stride, int eob, int bd); -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index da449e254..ced0c22cb 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -101,11 +101,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. 
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; } # diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 8023ebd57..0760f8c23 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -189,7 +189,7 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)); + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { @@ -257,7 +257,7 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)); + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index bdd666286..ab488f48f 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -184,7 +184,7 @@ struct macroblock { void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); void 
(*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, + void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); #endif }; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 9d3152e1e..7e30499c5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -637,7 +637,7 @@ static void encode_block(int plane, int block, int row, int col, if (x->skip_encode || p->eobs[block] == 0) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)); + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], @@ -700,8 +700,8 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)), - pd->dst.stride, p->eobs[block], xd->bd); + x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -801,7 +801,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)); + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index be4ff234d..bf0fec3d8 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -601,22 +601,21 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { 
vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, 32, NULL, 0, NULL, 0, bs, bs, xd->bd); - recon = CAST_TO_BYTEPTR(recon16); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -1005,7 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int block = (row + idy) * 2 + (col + idx); const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; - uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)); + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 486a33148..98e42cd25 100644 --- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -1268,10 +1268,8 @@ void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, } } -void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) 
{ - uint16_t *dest = CAST_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[16 * 16]; @@ -1313,10 +1311,8 @@ void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CAST_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[16 * 16]; @@ -1349,10 +1345,8 @@ void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CAST_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[4 * 16]; @@ -1414,7 +1408,7 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -1422,7 +1416,6 @@ void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); int i; if (a1 >= 0) { diff --git a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c index 9646f744f..96a55c472 100644 --- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -386,15 +386,14 @@ static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out, } static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, - uint8_t *const dest, - const 
int stride, const int bd) { + uint16_t *dst, const int stride, + const int bd) { int i, idct32_pass_loop; int32_t trans_buf[32 * 8]; int32_t pass1[32 * 32]; int32_t pass2[32 * 32]; int32_t *out; int32x4x2_t q[16]; - uint16_t *dst = CAST_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; idct32_pass_loop++, input = pass1, out = pass2) { @@ -637,10 +636,10 @@ static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, } } -void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { if (bd == 8) { - vpx_idct32_32_neon(input, dest, stride, 1); + vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1); } else { vpx_highbd_idct32_32_neon(input, dest, stride, bd); } diff --git a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c index b2c776b57..3970a5a86 100644 --- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -726,10 +726,9 @@ static void vpx_highbd_idct32_16_neon(const int32_t *const input, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 16]; diff --git a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c index 41622a249..5d9063b15 100644 --- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -594,10 +594,9 @@ static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8, +void 
vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 8]; diff --git a/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_add_neon.c index e7bfb3e7f..63eb49678 100644 --- a/vpx_dsp/arm/highbd_idct32x32_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -59,7 +59,7 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -67,7 +67,6 @@ void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); int i; if (a1 >= 0) { diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c index dd8a3d55e..20b09f683 100644 --- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -51,7 +51,7 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); const tran_low_t out0 = @@ -60,7 +60,6 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); 
highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); @@ -133,14 +132,13 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, *a3 = vsubq_s32(b0, b3); } -void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); int32x4_t c0 = vld1q_s32(input); int32x4_t c1 = vld1q_s32(input + 4); int32x4_t c2 = vld1q_s32(input + 8); int32x4_t c3 = vld1q_s32(input + 12); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); int16x8_t a0, a1; if (bd == 8) { diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c index 50b82779a..6687e7649 100644 --- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -36,7 +36,7 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -44,7 +44,6 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); if (a1 >= 0) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); @@ -292,9 +291,8 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, vst1q_u16(dest, d7_u16); } -void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CAST_TO_SHORTPTR(dest8); int32x4_t a0 = vld1q_s32(input); int32x4_t a1 = vld1q_s32(input + 8); int32x4_t a2 = 
vld1q_s32(input + 16); @@ -553,9 +551,8 @@ static INLINE void idct8x8_64_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CAST_TO_SHORTPTR(dest8); int32x4_t a0 = vld1q_s32(input); int32x4_t a1 = vld1q_s32(input + 4); int32x4_t a2 = vld1q_s32(input + 8); diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index 14aa8ba22..07631b84f 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -1290,7 +1290,7 @@ static INLINE int detect_invalid_highbd_input(const tran_low_t *input, return 0; } -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ @@ -1299,7 +1299,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, tran_high_t a1, b1, c1, d1, e1; const tran_low_t *ip = input; tran_low_t *op = output; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); for (i = 0; i < 4; i++) { a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1348,14 +1347,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; const tran_low_t *ip = in; tran_low_t *op = tmp; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1452,13 +1450,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); } -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t 
*dest, int stride, int bd) { int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 4; ++i) { @@ -1478,13 +1475,12 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -1636,13 +1632,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); } -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // First transform rows for (i = 0; i < 8; ++i) { @@ -1662,13 +1657,12 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // First transform rows // Only first 4 row has non-zero coefs @@ -1689,13 +1683,12 @@ void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) 
{ int i, j; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); @@ -2056,13 +2049,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // First transform rows for (i = 0; i < 16; ++i) { @@ -2082,13 +2074,12 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *const dest = CAST_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 8x8 area, we only need to calculate first 8 rows here. @@ -2111,13 +2102,12 @@ void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
@@ -2138,13 +2128,12 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CAST_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -2531,13 +2520,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 32; ++i) { @@ -2569,13 +2557,12 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *const dest = CAST_TO_SHORTPTR(dest8); // Rows // Only upper-left 16x16 has non-zero coeff @@ -2598,13 +2585,12 @@ void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); // Rows // Only upper-left 
8x8 has non-zero coeff @@ -2625,11 +2611,10 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 24e5b8b91..739174972 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -629,39 +629,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. specialize qw/vpx_iwht4x4_16_add sse2/; - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct4x4_1_add neon/; - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize 
qw/vpx_highbd_idct8x8_1_add neon/; - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_1_add neon/; - add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct32x32_1_add neon sse2/; - add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const 
tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index d469e5219..81e09dcb5 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -3364,7 +3364,7 @@ static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { return retval; } -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { tran_low_t out[4 * 4]; tran_low_t *outptr = out; @@ -3373,7 +3373,6 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i sign_bits[2]; __m128i temp_mm, min_input, max_input; int test; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); int optimised_cols = 0; const __m128i zero = _mm_set1_epi16(0); const __m128i eight = _mm_set1_epi16(8); @@ -3479,14 +3478,13 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { tran_low_t out[8 * 8]; tran_low_t *outptr = out; int i, j, test; __m128i inptr[8]; __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); const __m128i zero = _mm_set1_epi16(0); const __m128i sixteen = _mm_set1_epi16(16); const __m128i max = _mm_set1_epi16(6201); @@ -3579,14 +3577,13 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t 
*dest8, } } -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; int i, j, test; __m128i inptr[8]; __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); const __m128i zero = _mm_set1_epi16(0); const __m128i sixteen = _mm_set1_epi16(16); const __m128i max = _mm_set1_epi16(6201); @@ -3682,14 +3679,13 @@ void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { tran_low_t out[16 * 16]; tran_low_t *outptr = out; int i, j, test; __m128i inptr[32]; __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); const __m128i zero = _mm_set1_epi16(0); const __m128i rounding = _mm_set1_epi16(32); const __m128i max = _mm_set1_epi16(3155); @@ -3795,14 +3791,13 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; int i, j, test; __m128i inptr[32]; __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); const __m128i zero = _mm_set1_epi16(0); const __m128i rounding = _mm_set1_epi16(32); const __m128i max = _mm_set1_epi16(3155); @@ -3913,14 +3908,13 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int 
bd) { __m128i dc_value, d; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a, i, j; - uint16_t *dest = CAST_TO_SHORTPTR(dest8); tran_low_t out; out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);