diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl index 0d14ad85b..440470187 100644 --- a/vp10/common/vp10_rtcd_defs.pl +++ b/vp10/common/vp10_rtcd_defs.pl @@ -87,65 +87,127 @@ specialize qw/vp10_filter_by_weight8x8 sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp10_iht4x4_16_add/; + if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { + add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x4_16_add/; - add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp10_iht8x8_64_add/; + add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x8_64_add/; - add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp10_iht16x16_256_add/; + add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp10_iht16x16_256_add/; - add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct4x4 sse2/; + add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4/; - add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct4x4_1 sse2/; + add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4_1/; - add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct8x8 sse2/; + add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8/; - add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct8x8_1 sse2/; + add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8_1/; - add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct16x16 sse2/; + add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16/; - add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct16x16_1 sse2/; + add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16_1/; - add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct32x32 sse2/; + add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32/; - add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct32x32_rd sse2/; + add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_rd/; - add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct32x32_1 sse2/; + add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_1/; - add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct4x4 sse2/; + add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct4x4/; - add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct8x8 sse2/; + add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8/; - add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct8x8_1/; + add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8_1/; - add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct16x16 sse2/; + add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct16x16/; - add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct16x16_1/; + add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct16x16_1/; - add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct32x32 sse2/; + add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32/; - add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct32x32_rd sse2/; + add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_rd/; - add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct32x32_1/; + add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_1/; + } else { + add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x4_16_add sse2/; + + add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x8_64_add sse2/; + + add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp10_iht16x16_256_add/; + + add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4 sse2/; + + add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4_1 sse2/; + + add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8 sse2/; + + add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8_1 sse2/; + + add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16 sse2/; + + add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16_1 sse2/; + + add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32 sse2/; + + add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_rd sse2/; + + add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_1 sse2/; + + add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct4x4 sse2/; + + add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8 sse2/; + + add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8_1/; + + add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct16x16 sse2/; + + add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct16x16_1/; + + add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32 sse2/; + + add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_rd sse2/; + + add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_1/; + } } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c index d58e26358..a2c674b80 100644 --- a/vp10/common/x86/idct_intrin_sse2.c +++ b/vp10/common/x86/idct_intrin_sse2.c @@ -12,14 +12,14 @@ #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" -void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadu_si128((const __m128i *)(input)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -77,21 +77,21 @@ void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, } } -void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data - in[0] = _mm_load_si128((const __m128i *)input); - in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); + in[4] = load_input_data(input + 8 * 4); + in[5] = load_input_data(input + 8 * 5); + in[6] = load_input_data(input + 8 * 6); + in[7] = load_input_data(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT @@ -144,8 +144,8 @@ void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest + 7 * stride, in[7]); } -void vp10_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { __m128i in0[16], in1[16]; load_buffer_8x16(input, in0);