From a6a4659bea0859dfa663cd8b0e85cb7c7bf3a50e Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Mon, 27 Jul 2015 16:05:15 -0700 Subject: [PATCH] Factor 32x32 fwd DCT to vpx_dsp folder Move the 32x32 2D-DCT implementations from vp9/ to vpx_dsp/. Change-Id: Id3980696f8b69906ff7a59ff9fb2b9013d60047d --- test/dct32x32_test.cc | 5 +- vp9/common/vp9_rtcd_defs.pl | 18 - vp9/encoder/mips/msa/vp9_fdct32x32_msa.c | 923 ----------------- vp9/encoder/mips/msa/vp9_fdct_msa.h | 79 -- vp9/encoder/vp9_dct.c | 413 -------- vp9/encoder/x86/vp9_dct_sse2.c | 44 - vp9/vp9cx.mk | 3 - vpx_dsp/fwd_txfm.c | 413 ++++++++ vpx_dsp/fwd_txfm.h | 6 + vpx_dsp/mips/fwd_dct32x32_msa.c | 933 ++++++++++++++++++ vpx_dsp/mips/fwd_txfm_msa.h | 79 ++ vpx_dsp/vpx_dsp.mk | 4 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 18 + .../x86/fwd_dct32x32_impl_avx2.h | 2 - .../x86/fwd_dct32x32_impl_sse2.h | 6 +- .../x86/fwd_txfm_avx2.c | 9 +- vpx_dsp/x86/fwd_txfm_sse2.c | 25 +- 17 files changed, 1487 insertions(+), 1493 deletions(-) create mode 100644 vpx_dsp/mips/fwd_dct32x32_msa.c rename vp9/encoder/x86/vp9_dct32x32_avx2_impl.h => vpx_dsp/x86/fwd_dct32x32_impl_avx2.h (99%) rename vp9/encoder/x86/vp9_dct32x32_sse2_impl.h => vpx_dsp/x86/fwd_dct32x32_impl_sse2.h (99%) rename vp9/encoder/x86/vp9_dct_avx2.c => vpx_dsp/x86/fwd_txfm_avx2.c (75%) diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 25059a5d3..014658ed0 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -12,14 +12,15 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" #include "test/acm_random.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vp9/common/vp9_entropy.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f80d31ed7..a713f7feb 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -837,12 +837,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct32x32_1 sse2/; - - add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32 sse2/; - - add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32_rd sse2/; } else { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp9_fht4x4 sse2 msa/; @@ -867,12 +861,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct32x32_1 sse2 msa/; - - add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32 sse2 avx2 msa/; - - add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/; } # @@ -935,12 +923,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct32x32_1/; - add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_highbd_fdct32x32 sse2/; - - add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_highbd_fdct32x32_rd sse2/; - add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; specialize qw/vp9_highbd_temporal_filter_apply/; diff --git a/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c b/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c index 17a3f94f6..81f2c3a48 100644 --- a/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c +++ b/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c @@ -8,695 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "./vp9_rtcd.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" -static void fdct8x32_1d_column_load_butterfly(const int16_t *input, - int32_t src_stride, - int16_t *temp_buff) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 step0, step1, step2, step3; - v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; - v8i16 step0_1, step1_1, step2_1, step3_1; - - /* 1st and 2nd set */ - LD_SH4(input, src_stride, in0, in1, in2, in3); - LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); - LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); - LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); - SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, - step0, step1, step2, step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH4(step0, step1, step2, step3, temp_buff, 8); - ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); - ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); - ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); - - /* 3rd and 4th set */ - LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); - LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); - LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); - LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); - SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, - step0, step1, step2, step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); - ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); - ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); - ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); -} - -static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 temp0, temp1; - - /* fdct even */ - LD_SH4(input, 8, in0, in1, in2, in3); - LD_SH4(input + 96, 8, in12, in13, in14, in15); - BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, in12, in13, in14, in15); - LD_SH4(input + 32, 8, in4, in5, in6, in7); - LD_SH4(input + 64, 8, in8, in9, in10, in11); - BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, - vec4, vec5, vec6, vec7, in8, in9, in10, in11); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); - DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp); - ST_SH(temp1, temp + 512); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 256); - ST_SH(temp1, temp + 768); - - SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 128); - ST_SH(temp1, temp + 896); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 640); - ST_SH(temp1, temp + 384); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 64); - ST_SH(temp1, temp + 960); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 576); - ST_SH(temp1, temp + 448); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 320); - ST_SH(temp1, temp + 704); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 192); - ST_SH(temp1, temp + 832); -} - -static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; - - in20 = LD_SH(input + 32); - in21 = LD_SH(input + 40); - in26 = LD_SH(input + 80); - in27 = LD_SH(input + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - in18 = LD_SH(input + 16); - in19 = LD_SH(input + 24); - in28 = LD_SH(input + 96); - in29 = LD_SH(input + 104); - - vec4 = in19 - in20; - ST_SH(vec4, input + 32); - vec4 = in18 - in21; - ST_SH(vec4, input + 40); - vec4 = in29 - in26; - ST_SH(vec4, input + 80); - vec4 = in28 - in27; - ST_SH(vec4, input + 88); - - in21 = in18 + in21; - in20 = in19 + in20; - in27 = in28 + in27; - in26 = in29 + in26; - - LD_SH4(input + 48, 8, in22, in23, in24, in25); - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - - in16 = LD_SH(input); - in17 = LD_SH(input + 8); - in30 = LD_SH(input + 112); - in31 = LD_SH(input + 120); - - vec4 = in17 - in22; - ST_SH(vec4, input + 16); - vec4 = in16 - in23; - ST_SH(vec4, input + 24); - vec4 = in31 - in24; - ST_SH(vec4, input + 96); - vec4 = in30 - in25; - ST_SH(vec4, input + 104); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr); - ST_SH(vec4, temp_ptr + 960); - - SUB2(in27, in26, in25, in24, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 448); - ST_SH(vec4, temp_ptr + 512); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec4, temp_ptr + 704); - ST_SH(vec5, temp_ptr + 256); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec4, temp_ptr + 192); - ST_SH(vec5, temp_ptr + 768); - - LD_SH4(input + 16, 8, in22, in23, in20, in21); - LD_SH4(input + 80, 8, in26, in27, in24, in25); - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - ADD2(in28, in29, in31, in30, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 832); - ST_SH(vec4, temp_ptr + 128); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 320); - ST_SH(vec4, temp_ptr + 640); - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 576); - ST_SH(vec4, temp_ptr + 384); - - ADD2(in29, in28, in30, in31, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 64); - ST_SH(vec4, temp_ptr + 896); -} - -static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, - int16_t *tmp_buf, int16_t *tmp_buf_big) { - fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); - fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); - fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); -} - -static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, - int16_t *output) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 step0, step1, step2, step3, step4, step5, step6, step7; - - LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - step0, step1, step2, step3, step4, step5, step6, step7, - in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); - - /* 2nd set */ - LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - step0, step1, step2, step3, step4, step5, step6, step7, - in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, - (output + 8 * 8), 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); -} - -static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, - int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; - v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; - v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); - - /* Stage 3 */ - UNPCK_SH_SW(vec0, vec0_l, vec0_r); - UNPCK_SH_SW(vec1, vec1_l, vec1_r); - UNPCK_SH_SW(vec2, vec2_l, vec2_r); - UNPCK_SH_SW(vec3, vec3_l, vec3_r); - UNPCK_SH_SW(vec4, vec4_l, vec4_r); - UNPCK_SH_SW(vec5, vec5_l, vec5_r); - UNPCK_SH_SW(vec6, vec6_l, vec6_r); - UNPCK_SH_SW(vec7, vec7_l, vec7_r); - ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, - tmp0_w, tmp1_w, tmp2_w, tmp3_w); - BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); - ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, - vec0_r, vec1_r, vec2_r, vec3_r); - - tmp3_w = vec0_r + vec3_r; - vec0_r = vec0_r - vec3_r; - vec3_r = vec1_r + vec2_r; - vec1_r = vec1_r - vec2_r; - - DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, - cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); - FDCT32_POSTPROC_NEG_W(vec4_r); - FDCT32_POSTPROC_NEG_W(tmp3_w); - FDCT32_POSTPROC_NEG_W(vec6_r); - FDCT32_POSTPROC_NEG_W(vec3_r); - PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); - ST_SH2(vec5, vec4, out, 8); - - DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, - cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); - FDCT32_POSTPROC_NEG_W(vec4_r); - FDCT32_POSTPROC_NEG_W(tmp3_w); - FDCT32_POSTPROC_NEG_W(vec6_r); - FDCT32_POSTPROC_NEG_W(vec3_r); - PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); - ST_SH2(vec5, vec4, out + 16, 8); - - LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 32); - ST_SH(in5, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 40); - ST_SH(in5, out + 48); - - LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 64); - ST_SH(in5, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 72); - ST_SH(in5, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 80); - ST_SH(in5, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 96); - ST_SH(in5, out + 88); -} - -static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); - DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out); - ST_SH(temp1, out + 8); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 16); - ST_SH(temp1, out + 24); - - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 32); - ST_SH(temp1, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 40); - ST_SH(temp1, out + 48); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 64); - ST_SH(temp1, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 72); - ST_SH(temp1, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 80); - ST_SH(temp1, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 96); - ST_SH(temp1, out + 88); -} - -static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, - int16_t *out) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; - - in20 = LD_SH(temp + 32); - in21 = LD_SH(temp + 40); - in26 = LD_SH(temp + 80); - in27 = LD_SH(temp + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - in18 = LD_SH(temp + 16); - in19 = LD_SH(temp + 24); - in28 = LD_SH(temp + 96); - in29 = LD_SH(temp + 104); - - vec4 = in19 - in20; - ST_SH(vec4, interm_ptr + 32); - vec4 = in18 - in21; - ST_SH(vec4, interm_ptr + 88); - vec4 = in28 - in27; - ST_SH(vec4, interm_ptr + 56); - vec4 = in29 - in26; - ST_SH(vec4, interm_ptr + 64); - - ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); - - in22 = LD_SH(temp + 48); - in23 = LD_SH(temp + 56); - in24 = LD_SH(temp + 64); - in25 = LD_SH(temp + 72); - - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - - in16 = LD_SH(temp); - in17 = LD_SH(temp + 8); - in30 = LD_SH(temp + 112); - in31 = LD_SH(temp + 120); - - vec4 = in17 - in22; - ST_SH(vec4, interm_ptr + 40); - vec4 = in30 - in25; - ST_SH(vec4, interm_ptr + 48); - vec4 = in31 - in24; - ST_SH(vec4, interm_ptr + 72); - vec4 = in16 - in23; - ST_SH(vec4, interm_ptr + 80); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - - ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out); - ST_SH(vec4, out + 120); - - SUB2(in27, in26, in25, in24, in22, in21); - - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 112); - ST_SH(vec4, out + 8); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 16); - ST_SH(vec5, out + 104); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 24); - ST_SH(vec5, out + 96); - - in20 = LD_SH(interm_ptr + 32); - in21 = LD_SH(interm_ptr + 88); - in27 = LD_SH(interm_ptr + 56); - in26 = LD_SH(interm_ptr + 64); - - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - - in22 = LD_SH(interm_ptr + 40); - in25 = LD_SH(interm_ptr + 48); - in24 = LD_SH(interm_ptr + 72); - in23 = LD_SH(interm_ptr + 80); - - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - ADD2(in28, in29, in31, in30, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 32); - ST_SH(vec4, out + 88); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 40); - ST_SH(vec4, out + 80); - - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 72); - ST_SH(vec4, out + 48); - - ADD2(in29, in28, in30, in31, in17, in18); - - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 56); - ST_SH(vec5, out + 64); -} - -static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; - - /* 1st set */ - in0 = LD_SH(temp); - in4 = LD_SH(temp + 32); - in2 = LD_SH(temp + 64); - in6 = LD_SH(temp + 96); - in1 = LD_SH(temp + 128); - in7 = LD_SH(temp + 152); - in3 = LD_SH(temp + 192); - in5 = LD_SH(temp + 216); - - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - - /* 2nd set */ - in0_1 = LD_SH(temp + 16); - in1_1 = LD_SH(temp + 232); - in2_1 = LD_SH(temp + 80); - in3_1 = LD_SH(temp + 168); - in4_1 = LD_SH(temp + 48); - in5_1 = LD_SH(temp + 176); - in6_1 = LD_SH(temp + 112); - in7_1 = LD_SH(temp + 240); - - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); - TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); - - /* 3rd set */ - in0 = LD_SH(temp + 8); - in1 = LD_SH(temp + 136); - in2 = LD_SH(temp + 72); - in3 = LD_SH(temp + 200); - in4 = LD_SH(temp + 40); - in5 = LD_SH(temp + 208); - in6 = LD_SH(temp + 104); - in7 = LD_SH(temp + 144); - - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - output + 8, 32); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); - - /* 4th set */ - in0_1 = LD_SH(temp + 24); - in1_1 = LD_SH(temp + 224); - in2_1 = LD_SH(temp + 88); - in3_1 = LD_SH(temp + 160); - in4_1 = LD_SH(temp + 56); - in5_1 = LD_SH(temp + 184); - in6_1 = LD_SH(temp + 120); - in7_1 = LD_SH(temp + 248); - - TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - output + 24, 32); -} - -static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, - int16_t *output) { - fdct8x32_1d_row_load_butterfly(temp, temp_buf); - fdct8x32_1d_row_even(temp_buf, temp_buf); - fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); - fdct8x32_1d_row_transpose_store(temp_buf, output); -} - -static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, - int16_t *output) { - fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); - fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); - fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); - fdct8x32_1d_row_transpose_store(tmp_buf, output); -} - -void vp9_fdct32x32_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); - - /* column transform */ - for (i = 0; i < 4; ++i) { - fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, - tmp_buf_big + (8 * i)); - } - - /* row transform */ - fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); - - /* row transform */ - for (i = 1; i < 4; ++i) { - fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); - } -} - void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { out[1] = 0; @@ -718,239 +31,3 @@ void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { out[0] += LD_HADD(input + 32 * 24 + 24, stride); out[0] >>= 3; } - -static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); - FDCT_POSTPROC_2V_NEG_H(vec0, vec1); - FDCT_POSTPROC_2V_NEG_H(vec2, vec3); - FDCT_POSTPROC_2V_NEG_H(vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec6, vec7); - FDCT_POSTPROC_2V_NEG_H(in8, in9); - FDCT_POSTPROC_2V_NEG_H(in10, in11); - FDCT_POSTPROC_2V_NEG_H(in12, in13); - FDCT_POSTPROC_2V_NEG_H(in14, in15); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - - temp0 = in0 + in3; - in0 = in0 - in3; - in3 = in1 + in2; - in1 = in1 - in2; - - DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); - ST_SH(temp0, out); - ST_SH(temp1, out + 8); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - ST_SH(temp0, out + 16); - ST_SH(temp1, out + 24); - - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - ST_SH(temp0, out + 32); - ST_SH(temp1, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - ST_SH(temp0, out + 40); - ST_SH(temp1, out + 48); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - ST_SH(temp0, out + 64); - ST_SH(temp1, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - ST_SH(temp0, out + 72); - ST_SH(temp1, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - ST_SH(temp0, out + 80); - ST_SH(temp1, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - ST_SH(temp0, out + 96); - ST_SH(temp1, out + 88); -} - -static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, - int16_t *out) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31; - v8i16 vec4, vec5; - - in20 = LD_SH(temp + 32); - in21 = LD_SH(temp + 40); - in26 = LD_SH(temp + 80); - in27 = LD_SH(temp + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - FDCT_POSTPROC_2V_NEG_H(in20, in21); - FDCT_POSTPROC_2V_NEG_H(in26, in27); - - in18 = LD_SH(temp + 16); - in19 = LD_SH(temp + 24); - in28 = LD_SH(temp + 96); - in29 = LD_SH(temp + 104); - - FDCT_POSTPROC_2V_NEG_H(in18, in19); - FDCT_POSTPROC_2V_NEG_H(in28, in29); - - vec4 = in19 - in20; - ST_SH(vec4, interm_ptr + 32); - vec4 = in18 - in21; - ST_SH(vec4, interm_ptr + 88); - vec4 = in29 - in26; - ST_SH(vec4, interm_ptr + 64); - vec4 = in28 - in27; - ST_SH(vec4, interm_ptr + 56); - - ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); - - in22 = LD_SH(temp + 48); - in23 = LD_SH(temp + 56); - in24 = LD_SH(temp + 64); - in25 = LD_SH(temp + 72); - - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - FDCT_POSTPROC_2V_NEG_H(in22, in23); - FDCT_POSTPROC_2V_NEG_H(in24, in25); - - in16 = LD_SH(temp); - in17 = LD_SH(temp + 8); - in30 = LD_SH(temp + 112); - in31 = LD_SH(temp + 120); - - FDCT_POSTPROC_2V_NEG_H(in16, in17); - FDCT_POSTPROC_2V_NEG_H(in30, in31); - - vec4 = in17 - in22; - ST_SH(vec4, interm_ptr + 40); - vec4 = in30 - in25; - ST_SH(vec4, interm_ptr + 48); - vec4 = in31 - in24; - ST_SH(vec4, interm_ptr + 72); - vec4 = in16 - in23; - ST_SH(vec4, interm_ptr + 80); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - ST_SH(vec5, out); - ST_SH(vec4, out + 120); - - SUB2(in27, in26, in25, in24, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - ST_SH(vec5, out + 112); - ST_SH(vec4, out + 8); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - ST_SH(vec4, out + 16); - ST_SH(vec5, out + 104); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - ST_SH(vec4, out + 24); - ST_SH(vec5, out + 96); - - in20 = LD_SH(interm_ptr + 32); - in21 = LD_SH(interm_ptr + 88); - in27 = LD_SH(interm_ptr + 56); - in26 = LD_SH(interm_ptr + 64); - - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - - in22 = LD_SH(interm_ptr + 40); - in25 = LD_SH(interm_ptr + 48); - in24 = LD_SH(interm_ptr + 72); - in23 = LD_SH(interm_ptr + 80); - - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - in16 = in28 + in29; - in19 = in31 + in30; - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - ST_SH(vec5, out + 32); - ST_SH(vec4, out + 88); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - ST_SH(vec5, out + 40); - ST_SH(vec4, out + 80); - - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - ST_SH(vec5, out + 72); - ST_SH(vec4, out + 48); - - ADD2(in29, in28, in30, in31, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - ST_SH(vec4, out + 56); - ST_SH(vec5, out + 64); -} - -static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, - int16_t *output) { - fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); - fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); - fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); - fdct8x32_1d_row_transpose_store(tmp_buf, output); -} - -void vp9_fdct32x32_rd_msa(const int16_t *input, int16_t *out, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); - - /* column transform */ - for (i = 0; i < 4; ++i) { - fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], - &tmp_buf_big[0] + (8 * i)); - } - - /* row transform */ - for (i = 0; i < 4; ++i) { - fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], - out + (8 * i * 32)); - } -} diff --git a/vp9/encoder/mips/msa/vp9_fdct_msa.h b/vp9/encoder/mips/msa/vp9_fdct_msa.h index 425ba9db6..504d36154 100644 --- a/vp9/encoder/mips/msa/vp9_fdct_msa.h +++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -96,22 +96,6 @@ HADD_SW_S32(vec_w_m); \ }) -#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one_m = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clti_s_h(vec0, 0); \ - tp1_m = __msa_clti_s_h(vec1, 0); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one_m & tp0_m; \ - tp1_m = one_m & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ -} - #define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) { \ v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ @@ -145,67 +129,4 @@ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, \ s3_m, s3_m, out0, out1, out2, out3); \ } - -#define FDCT32_POSTPROC_NEG_W(vec) { \ - v4i32 temp_m; \ - v4i32 one_m = __msa_ldi_w(1); \ - \ - temp_m = __msa_clti_s_w(vec, 0); \ - vec += 1; \ - temp_m = one_m & temp_m; \ - vec += temp_m; \ - vec >>= 2; \ -} - -#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clei_s_h(vec0, 0); \ - tp1_m = __msa_clei_s_h(vec1, 0); \ - tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ - tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one & tp0_m; \ - tp1_m = one & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ -} - -#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \ - reg1_right, const0, const1, \ - out0, out1, out2, out3) { \ - v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ - v4i32 k0_m = __msa_fill_w((int32_t) const0); \ - \ - s0_m = __msa_fill_w((int32_t) const1); \ - k0_m = __msa_ilvev_w(s0_m, k0_m); \ - \ - ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ - ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ - ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ - ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ - \ - DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ - \ - DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ -} #endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */ diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 5dcbd1f7c..9f1c74015 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -822,410 +822,6 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, } } -static INLINE tran_high_t dct_32_round(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - // TODO(debargha, peter.derivaz): Find new bounds for this assert, - // and make the bounds consts. - // assert(-131072 <= rv && rv <= 131071); - return rv; -} - -static INLINE tran_high_t half_round_shift(tran_high_t input) { - tran_high_t rv = (input + 1 + (input < 0)) >> 2; - return rv; -} - -void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) { - tran_high_t step[32]; - // Stage 1 - step[0] = input[0] + input[(32 - 1)]; - step[1] = input[1] + input[(32 - 2)]; - step[2] = input[2] + input[(32 - 3)]; - step[3] = input[3] + input[(32 - 4)]; - step[4] = input[4] + input[(32 - 5)]; - step[5] = input[5] + input[(32 - 6)]; - step[6] = input[6] + input[(32 - 7)]; - step[7] = input[7] + input[(32 - 8)]; - step[8] = input[8] + input[(32 - 9)]; - step[9] = input[9] + input[(32 - 10)]; - step[10] = input[10] + input[(32 - 11)]; - step[11] = input[11] + input[(32 - 12)]; - step[12] = input[12] + input[(32 - 13)]; - step[13] = input[13] + input[(32 - 14)]; - step[14] = input[14] + input[(32 - 15)]; - step[15] = input[15] + input[(32 - 16)]; - step[16] = -input[16] + input[(32 - 17)]; - step[17] = -input[17] + input[(32 - 18)]; - step[18] = -input[18] + input[(32 - 19)]; - step[19] = -input[19] + input[(32 - 20)]; - step[20] = -input[20] + input[(32 - 21)]; - step[21] = -input[21] + input[(32 - 22)]; - step[22] = -input[22] + input[(32 - 23)]; - step[23] = -input[23] + input[(32 - 24)]; - step[24] = -input[24] + input[(32 - 25)]; - step[25] = -input[25] + input[(32 - 26)]; - step[26] = -input[26] + input[(32 - 27)]; - step[27] = -input[27] + input[(32 - 28)]; - step[28] = -input[28] + input[(32 - 29)]; - step[29] = -input[29] + input[(32 - 30)]; - step[30] = -input[30] + input[(32 - 31)]; - step[31] = -input[31] + input[(32 - 32)]; - - // Stage 2 - output[0] = step[0] + step[16 - 1]; - output[1] = step[1] + step[16 - 2]; - output[2] = step[2] + step[16 - 3]; - output[3] = step[3] + step[16 - 4]; - output[4] = step[4] + step[16 - 5]; - output[5] = step[5] + step[16 - 6]; - output[6] = step[6] + step[16 - 7]; - output[7] = step[7] + step[16 - 8]; - output[8] = -step[8] + step[16 - 9]; - output[9] = -step[9] + step[16 - 10]; - output[10] = -step[10] + step[16 - 11]; - output[11] = -step[11] + step[16 - 12]; - output[12] = -step[12] + step[16 - 13]; - output[13] = -step[13] + step[16 - 14]; - output[14] = -step[14] + step[16 - 15]; - output[15] = -step[15] + step[16 - 16]; - - output[16] = step[16]; - output[17] = step[17]; - output[18] = step[18]; - output[19] = step[19]; - - output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); - output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); - output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); - output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); - - output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); - output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); - output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); - output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); - - output[28] = step[28]; - output[29] = step[29]; - output[30] = step[30]; - output[31] = step[31]; - - // dump the magnitude by 4, hence the intermediate values are within - // the range of 16 bits. - if (round) { - output[0] = half_round_shift(output[0]); - output[1] = half_round_shift(output[1]); - output[2] = half_round_shift(output[2]); - output[3] = half_round_shift(output[3]); - output[4] = half_round_shift(output[4]); - output[5] = half_round_shift(output[5]); - output[6] = half_round_shift(output[6]); - output[7] = half_round_shift(output[7]); - output[8] = half_round_shift(output[8]); - output[9] = half_round_shift(output[9]); - output[10] = half_round_shift(output[10]); - output[11] = half_round_shift(output[11]); - output[12] = half_round_shift(output[12]); - output[13] = half_round_shift(output[13]); - output[14] = half_round_shift(output[14]); - output[15] = half_round_shift(output[15]); - - output[16] = half_round_shift(output[16]); - output[17] = half_round_shift(output[17]); - output[18] = half_round_shift(output[18]); - output[19] = half_round_shift(output[19]); - output[20] = half_round_shift(output[20]); - output[21] = half_round_shift(output[21]); - output[22] = half_round_shift(output[22]); - output[23] = half_round_shift(output[23]); - output[24] = half_round_shift(output[24]); - output[25] = half_round_shift(output[25]); - output[26] = half_round_shift(output[26]); - output[27] = half_round_shift(output[27]); - output[28] = half_round_shift(output[28]); - output[29] = half_round_shift(output[29]); - output[30] = half_round_shift(output[30]); - output[31] = half_round_shift(output[31]); - } - - // Stage 3 - step[0] = output[0] + output[(8 - 1)]; - step[1] = output[1] + output[(8 - 2)]; - step[2] = output[2] + output[(8 - 3)]; - step[3] = output[3] + output[(8 - 4)]; - step[4] = -output[4] + output[(8 - 5)]; - step[5] = -output[5] + output[(8 - 6)]; - step[6] = -output[6] + output[(8 - 7)]; - step[7] = -output[7] + output[(8 - 8)]; - step[8] = output[8]; - step[9] = output[9]; - step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); - step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); - step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); - step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); - step[14] = output[14]; - step[15] = output[15]; - - step[16] = output[16] + output[23]; - step[17] = output[17] + output[22]; - step[18] = output[18] + output[21]; - step[19] = output[19] + output[20]; - step[20] = -output[20] + output[19]; - step[21] = -output[21] + output[18]; - step[22] = -output[22] + output[17]; - step[23] = -output[23] + output[16]; - step[24] = -output[24] + output[31]; - step[25] = -output[25] + output[30]; - step[26] = -output[26] + output[29]; - step[27] = -output[27] + output[28]; - step[28] = output[28] + output[27]; - step[29] = output[29] + output[26]; - step[30] = output[30] + output[25]; - step[31] = output[31] + output[24]; - - // Stage 4 - output[0] = step[0] + step[3]; - output[1] = step[1] + step[2]; - output[2] = -step[2] + step[1]; - output[3] = -step[3] + step[0]; - output[4] = step[4]; - output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); - output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); - output[7] = step[7]; - output[8] = step[8] + step[11]; - output[9] = step[9] + step[10]; - output[10] = -step[10] + step[9]; - output[11] = -step[11] + step[8]; - output[12] = -step[12] + step[15]; - output[13] = -step[13] + step[14]; - output[14] = step[14] + step[13]; - output[15] = step[15] + step[12]; - - output[16] = step[16]; - output[17] = step[17]; - output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); - output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); - output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); - output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); - output[22] = step[22]; - output[23] = step[23]; - output[24] = step[24]; - output[25] = step[25]; - output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); - output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); - output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); - output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); - output[30] = step[30]; - output[31] = step[31]; - - // Stage 5 - step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); - step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); - step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); - step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); - step[4] = output[4] + output[5]; - step[5] = -output[5] + output[4]; - step[6] = -output[6] + output[7]; - step[7] = output[7] + output[6]; - step[8] = output[8]; - step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); - step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); - step[11] = output[11]; - step[12] = output[12]; - step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); - step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); - step[15] = output[15]; - - step[16] = output[16] + output[19]; - step[17] = output[17] + output[18]; - step[18] = -output[18] + output[17]; - step[19] = -output[19] + output[16]; - step[20] = -output[20] + output[23]; - step[21] = -output[21] + output[22]; - step[22] = output[22] + output[21]; - step[23] = output[23] + output[20]; - step[24] = output[24] + output[27]; - step[25] = output[25] + output[26]; - step[26] = -output[26] + output[25]; - step[27] = -output[27] + output[24]; - step[28] = -output[28] + output[31]; - step[29] = -output[29] + output[30]; - step[30] = output[30] + output[29]; - step[31] = output[31] + output[28]; - - // Stage 6 - output[0] = step[0]; - output[1] = step[1]; - output[2] = step[2]; - output[3] = step[3]; - output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); - output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); - output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); - output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); - output[8] = step[8] + step[9]; - output[9] = -step[9] + step[8]; - output[10] = -step[10] + step[11]; - output[11] = step[11] + step[10]; - output[12] = step[12] + step[13]; - output[13] = -step[13] + step[12]; - output[14] = -step[14] + step[15]; - output[15] = step[15] + step[14]; - - output[16] = step[16]; - output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); - output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); - output[19] = step[19]; - output[20] = step[20]; - output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); - output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); - output[23] = step[23]; - output[24] = step[24]; - output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); - output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); - output[27] = step[27]; - output[28] = step[28]; - output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); - output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); - output[31] = step[31]; - - // Stage 7 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - step[4] = output[4]; - step[5] = output[5]; - step[6] = output[6]; - step[7] = output[7]; - step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); - step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); - step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); - step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); - step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); - step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); - step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); - step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); - - step[16] = output[16] + output[17]; - step[17] = -output[17] + output[16]; - step[18] = -output[18] + output[19]; - step[19] = output[19] + output[18]; - step[20] = output[20] + output[21]; - step[21] = -output[21] + output[20]; - step[22] = -output[22] + output[23]; - step[23] = output[23] + output[22]; - step[24] = output[24] + output[25]; - step[25] = -output[25] + output[24]; - step[26] = -output[26] + output[27]; - step[27] = output[27] + output[26]; - step[28] = output[28] + output[29]; - step[29] = -output[29] + output[28]; - step[30] = -output[30] + output[31]; - step[31] = output[31] + output[30]; - - // Final stage --- outputs indices are bit-reversed. - output[0] = step[0]; - output[16] = step[1]; - output[8] = step[2]; - output[24] = step[3]; - output[4] = step[4]; - output[20] = step[5]; - output[12] = step[6]; - output[28] = step[7]; - output[2] = step[8]; - output[18] = step[9]; - output[10] = step[10]; - output[26] = step[11]; - output[6] = step[12]; - output[22] = step[13]; - output[14] = step[14]; - output[30] = step[15]; - - output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); - output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); - output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); - output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); - output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); - output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); - output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); - output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); - output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); - output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); - output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); - output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); - output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); - output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); - output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); - output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); -} - -void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 32; ++r) - for (c = 0; c < 32; ++c) - sum += input[r * stride + c]; - - output[0] = sum >> 3; - output[1] = 0; -} - -void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - int i, j; - tran_high_t output[32 * 32]; - - // Columns - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = input[j * stride + i] * 4; - vp9_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } - - // Rows - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i * 32]; - vp9_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } -} - -// Note that although we use dct_32_round in dct32 computation flow, -// this 2d fdct32x32 for rate-distortion optimization loop is operating -// within 16 bits precision. -void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { - int i, j; - tran_high_t output[32 * 32]; - - // Columns - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = input[j * stride + i] * 4; - vp9_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - // TODO(cd): see quality impact of only doing - // output[j * 32 + i] = (temp_out[j] + 1) >> 2; - // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } - - // Rows - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i * 32]; - vp9_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - out[j + i * 32] = (tran_low_t)temp_out[j]; - } -} - #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { @@ -1261,13 +857,4 @@ void vp9_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) { vp9_fdct32x32_1_c(input, out, stride); } - -void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - vp9_fdct32x32_c(input, out, stride); -} - -void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, - int stride) { - vp9_fdct32x32_rd_c(input, out, stride); -} #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index f263c8440..b39346080 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -2266,47 +2266,3 @@ void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, in1 = _mm_srai_epi32(in1, 3); store_output(&in1, output); } - -/* - * The DCTnxn functions are defined using the macros below. The main code for - * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h & - * vp9/encoder/x86/vp9_dct32x32_sse2_impl.h) which are used by both the 8 bit code - * and the high bit depth code. - */ - -#define DCT_HIGH_BIT_DEPTH 0 - -#define FDCT32x32_2D vp9_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D vp9_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#undef DCT_HIGH_BIT_DEPTH - - -#if CONFIG_VP9_HIGHBITDEPTH - -#define DCT_HIGH_BIT_DEPTH 1 - -#define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#undef DCT_HIGH_BIT_DEPTH - -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index bfd77c29b..c9278d2a6 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -118,14 +118,11 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2_impl.h ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c endif -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c index bdb55ee32..337b82e03 100644 --- a/vpx_dsp/fwd_txfm.c +++ b/vpx_dsp/fwd_txfm.c @@ -343,6 +343,410 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { } } +static INLINE tran_high_t dct_32_round(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + // TODO(debargha, peter.derivaz): Find new bounds for this assert, + // and make the bounds consts. + // assert(-131072 <= rv && rv <= 131071); + return rv; +} + +static INLINE tran_high_t half_round_shift(tran_high_t input) { + tran_high_t rv = (input + 1 + (input < 0)) >> 2; + return rv; +} + +void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) { + tran_high_t step[32]; + // Stage 1 + step[0] = input[0] + input[(32 - 1)]; + step[1] = input[1] + input[(32 - 2)]; + step[2] = input[2] + input[(32 - 3)]; + step[3] = input[3] + input[(32 - 4)]; + step[4] = input[4] + input[(32 - 5)]; + step[5] = input[5] + input[(32 - 6)]; + step[6] = input[6] + input[(32 - 7)]; + step[7] = input[7] + input[(32 - 8)]; + step[8] = input[8] + input[(32 - 9)]; + step[9] = input[9] + input[(32 - 10)]; + step[10] = input[10] + input[(32 - 11)]; + step[11] = input[11] + input[(32 - 12)]; + step[12] = input[12] + input[(32 - 13)]; + step[13] = input[13] + input[(32 - 14)]; + step[14] = input[14] + input[(32 - 15)]; + step[15] = input[15] + input[(32 - 16)]; + step[16] = -input[16] + input[(32 - 17)]; + step[17] = -input[17] + input[(32 - 18)]; + step[18] = -input[18] + input[(32 - 19)]; + step[19] = -input[19] + input[(32 - 20)]; + step[20] = -input[20] + input[(32 - 21)]; + step[21] = -input[21] + input[(32 - 22)]; + step[22] = -input[22] + input[(32 - 23)]; + step[23] = -input[23] + input[(32 - 24)]; + step[24] = -input[24] + input[(32 - 25)]; + step[25] = -input[25] + input[(32 - 26)]; + step[26] = -input[26] + input[(32 - 27)]; + step[27] = -input[27] + input[(32 - 28)]; + step[28] = -input[28] + input[(32 - 29)]; + step[29] = -input[29] + input[(32 - 30)]; + step[30] = -input[30] + input[(32 - 31)]; + step[31] = -input[31] + input[(32 - 32)]; + + // Stage 2 + output[0] = step[0] + step[16 - 1]; + output[1] = step[1] + step[16 - 2]; + output[2] = step[2] + step[16 - 3]; + output[3] = step[3] + step[16 - 4]; + output[4] = step[4] + step[16 - 5]; + output[5] = step[5] + step[16 - 6]; + output[6] = step[6] + step[16 - 7]; + output[7] = step[7] + step[16 - 8]; + output[8] = -step[8] + step[16 - 9]; + output[9] = -step[9] + step[16 - 10]; + output[10] = -step[10] + step[16 - 11]; + output[11] = -step[11] + step[16 - 12]; + output[12] = -step[12] + step[16 - 13]; + output[13] = -step[13] + step[16 - 14]; + output[14] = -step[14] + step[16 - 15]; + output[15] = -step[15] + step[16 - 16]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = step[18]; + output[19] = step[19]; + + output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); + output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); + output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); + output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); + + output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); + output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); + output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); + output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); + + output[28] = step[28]; + output[29] = step[29]; + output[30] = step[30]; + output[31] = step[31]; + + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. + if (round) { + output[0] = half_round_shift(output[0]); + output[1] = half_round_shift(output[1]); + output[2] = half_round_shift(output[2]); + output[3] = half_round_shift(output[3]); + output[4] = half_round_shift(output[4]); + output[5] = half_round_shift(output[5]); + output[6] = half_round_shift(output[6]); + output[7] = half_round_shift(output[7]); + output[8] = half_round_shift(output[8]); + output[9] = half_round_shift(output[9]); + output[10] = half_round_shift(output[10]); + output[11] = half_round_shift(output[11]); + output[12] = half_round_shift(output[12]); + output[13] = half_round_shift(output[13]); + output[14] = half_round_shift(output[14]); + output[15] = half_round_shift(output[15]); + + output[16] = half_round_shift(output[16]); + output[17] = half_round_shift(output[17]); + output[18] = half_round_shift(output[18]); + output[19] = half_round_shift(output[19]); + output[20] = half_round_shift(output[20]); + output[21] = half_round_shift(output[21]); + output[22] = half_round_shift(output[22]); + output[23] = half_round_shift(output[23]); + output[24] = half_round_shift(output[24]); + output[25] = half_round_shift(output[25]); + output[26] = half_round_shift(output[26]); + output[27] = half_round_shift(output[27]); + output[28] = half_round_shift(output[28]); + output[29] = half_round_shift(output[29]); + output[30] = half_round_shift(output[30]); + output[31] = half_round_shift(output[31]); + } + + // Stage 3 + step[0] = output[0] + output[(8 - 1)]; + step[1] = output[1] + output[(8 - 2)]; + step[2] = output[2] + output[(8 - 3)]; + step[3] = output[3] + output[(8 - 4)]; + step[4] = -output[4] + output[(8 - 5)]; + step[5] = -output[5] + output[(8 - 6)]; + step[6] = -output[6] + output[(8 - 7)]; + step[7] = -output[7] + output[(8 - 8)]; + step[8] = output[8]; + step[9] = output[9]; + step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); + step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); + step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); + step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); + step[14] = output[14]; + step[15] = output[15]; + + step[16] = output[16] + output[23]; + step[17] = output[17] + output[22]; + step[18] = output[18] + output[21]; + step[19] = output[19] + output[20]; + step[20] = -output[20] + output[19]; + step[21] = -output[21] + output[18]; + step[22] = -output[22] + output[17]; + step[23] = -output[23] + output[16]; + step[24] = -output[24] + output[31]; + step[25] = -output[25] + output[30]; + step[26] = -output[26] + output[29]; + step[27] = -output[27] + output[28]; + step[28] = output[28] + output[27]; + step[29] = output[29] + output[26]; + step[30] = output[30] + output[25]; + step[31] = output[31] + output[24]; + + // Stage 4 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = -step[2] + step[1]; + output[3] = -step[3] + step[0]; + output[4] = step[4]; + output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); + output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); + output[7] = step[7]; + output[8] = step[8] + step[11]; + output[9] = step[9] + step[10]; + output[10] = -step[10] + step[9]; + output[11] = -step[11] + step[8]; + output[12] = -step[12] + step[15]; + output[13] = -step[13] + step[14]; + output[14] = step[14] + step[13]; + output[15] = step[15] + step[12]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); + output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); + output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); + output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); + output[22] = step[22]; + output[23] = step[23]; + output[24] = step[24]; + output[25] = step[25]; + output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); + output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); + output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); + output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); + output[30] = step[30]; + output[31] = step[31]; + + // Stage 5 + step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); + step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); + step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); + step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); + step[4] = output[4] + output[5]; + step[5] = -output[5] + output[4]; + step[6] = -output[6] + output[7]; + step[7] = output[7] + output[6]; + step[8] = output[8]; + step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); + step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); + step[11] = output[11]; + step[12] = output[12]; + step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); + step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); + step[15] = output[15]; + + step[16] = output[16] + output[19]; + step[17] = output[17] + output[18]; + step[18] = -output[18] + output[17]; + step[19] = -output[19] + output[16]; + step[20] = -output[20] + output[23]; + step[21] = -output[21] + output[22]; + step[22] = output[22] + output[21]; + step[23] = output[23] + output[20]; + step[24] = output[24] + output[27]; + step[25] = output[25] + output[26]; + step[26] = -output[26] + output[25]; + step[27] = -output[27] + output[24]; + step[28] = -output[28] + output[31]; + step[29] = -output[29] + output[30]; + step[30] = output[30] + output[29]; + step[31] = output[31] + output[28]; + + // Stage 6 + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); + output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); + output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); + output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); + output[8] = step[8] + step[9]; + output[9] = -step[9] + step[8]; + output[10] = -step[10] + step[11]; + output[11] = step[11] + step[10]; + output[12] = step[12] + step[13]; + output[13] = -step[13] + step[12]; + output[14] = -step[14] + step[15]; + output[15] = step[15] + step[14]; + + output[16] = step[16]; + output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); + output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); + output[19] = step[19]; + output[20] = step[20]; + output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); + output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); + output[23] = step[23]; + output[24] = step[24]; + output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); + output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); + output[27] = step[27]; + output[28] = step[28]; + output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); + output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); + output[31] = step[31]; + + // Stage 7 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); + step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); + step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); + step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); + step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); + step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); + step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); + step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); + + step[16] = output[16] + output[17]; + step[17] = -output[17] + output[16]; + step[18] = -output[18] + output[19]; + step[19] = output[19] + output[18]; + step[20] = output[20] + output[21]; + step[21] = -output[21] + output[20]; + step[22] = -output[22] + output[23]; + step[23] = output[23] + output[22]; + step[24] = output[24] + output[25]; + step[25] = -output[25] + output[24]; + step[26] = -output[26] + output[27]; + step[27] = output[27] + output[26]; + step[28] = output[28] + output[29]; + step[29] = -output[29] + output[28]; + step[30] = -output[30] + output[31]; + step[31] = output[31] + output[30]; + + // Final stage --- outputs indices are bit-reversed. + output[0] = step[0]; + output[16] = step[1]; + output[8] = step[2]; + output[24] = step[3]; + output[4] = step[4]; + output[20] = step[5]; + output[12] = step[6]; + output[28] = step[7]; + output[2] = step[8]; + output[18] = step[9]; + output[10] = step[10]; + output[26] = step[11]; + output[6] = step[12]; + output[22] = step[13]; + output[14] = step[14]; + output[30] = step[15]; + + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); + output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); + output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); + output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); + output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); + output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); + output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); +} + +void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) + sum += input[r * stride + c]; + + output[0] = sum >> 3; + output[1] = 0; +} + +void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { + int i, j; + tran_high_t output[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = input[j * stride + i] * 4; + vp9_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = output[j + i * 32]; + vp9_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} + +// Note that although we use dct_32_round in dct32 computation flow, +// this 2d fdct32x32 for rate-distortion optimization loop is operating +// within 16 bits precision. +void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { + int i, j; + tran_high_t output[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = input[j * stride + i] * 4; + vp9_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + // TODO(cd): see quality impact of only doing + // output[j * 32 + i] = (temp_out[j] + 1) >> 2; + // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = output[j + i * 32]; + vp9_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) + out[j + i * 32] = (tran_low_t)temp_out[j]; + } +} + #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { @@ -358,4 +762,13 @@ void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { vp9_fdct16x16_c(input, output, stride); } + +void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { + vp9_fdct32x32_c(input, out, stride); +} + +void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, + int stride) { + vp9_fdct32x32_rd_c(input, out, stride); +} #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/fwd_txfm.h b/vpx_dsp/fwd_txfm.h index 509fe7fe3..729a289d1 100644 --- a/vpx_dsp/fwd_txfm.h +++ b/vpx_dsp/fwd_txfm.h @@ -8,6 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VPX_DSP_FWD_TXFM_H_ +#define VPX_DSP_FWD_TXFM_H_ + #include "vpx_dsp/txfm_common.h" static INLINE tran_high_t fdct_round_shift(tran_high_t input) { @@ -17,3 +20,6 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { // assert(INT16_MIN <= rv && rv <= INT16_MAX); return rv; } + +void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round); +#endif // VPX_DSP_FWD_TXFM_H_ diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c new file mode 100644 index 000000000..80573f1c4 --- /dev/null +++ b/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -0,0 +1,933 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/mips/fwd_txfm_msa.h" + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 step0, step1, step2, step3; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + v8i16 step0_1, step1_1, step2_1, step3_1; + + /* 1st and 2nd set */ + LD_SH4(input, src_stride, in0, in1, in2, in3); + LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, + step0, step1, step2, step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff, 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); + + /* 3rd and 4th set */ + LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); + LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, + step0, step1, step2, step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 temp0, temp1; + + /* fdct even */ + LD_SH4(input, 8, in0, in1, in2, in3); + LD_SH4(input + 96, 8, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, in12, in13, in14, in15); + LD_SH4(input + 32, 8, in4, in5, in6, in7); + LD_SH4(input + 64, 8, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, + vec4, vec5, vec6, vec7, in8, in9, in10, in11); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp); + ST_SH(temp1, temp + 512); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 256); + ST_SH(temp1, temp + 768); + + SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 128); + ST_SH(temp1, temp + 896); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 640); + ST_SH(temp1, temp + 384); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 64); + ST_SH(temp1, temp + 960); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 576); + ST_SH(temp1, temp + 448); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 320); + ST_SH(temp1, temp + 704); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 192); + ST_SH(temp1, temp + 832); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(input + 32); + in21 = LD_SH(input + 40); + in26 = LD_SH(input + 80); + in27 = LD_SH(input + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(input + 16); + in19 = LD_SH(input + 24); + in28 = LD_SH(input + 96); + in29 = LD_SH(input + 104); + + vec4 = in19 - in20; + ST_SH(vec4, input + 32); + vec4 = in18 - in21; + ST_SH(vec4, input + 40); + vec4 = in29 - in26; + ST_SH(vec4, input + 80); + vec4 = in28 - in27; + ST_SH(vec4, input + 88); + + in21 = in18 + in21; + in20 = in19 + in20; + in27 = in28 + in27; + in26 = in29 + in26; + + LD_SH4(input + 48, 8, in22, in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(input); + in17 = LD_SH(input + 8); + in30 = LD_SH(input + 112); + in31 = LD_SH(input + 120); + + vec4 = in17 - in22; + ST_SH(vec4, input + 16); + vec4 = in16 - in23; + ST_SH(vec4, input + 24); + vec4 = in31 - in24; + ST_SH(vec4, input + 96); + vec4 = in30 - in25; + ST_SH(vec4, input + 104); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr); + ST_SH(vec4, temp_ptr + 960); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 448); + ST_SH(vec4, temp_ptr + 512); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 704); + ST_SH(vec5, temp_ptr + 256); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 192); + ST_SH(vec5, temp_ptr + 768); + + LD_SH4(input + 16, 8, in22, in23, in20, in21); + LD_SH4(input + 80, 8, in26, in27, in24, in25); + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 832); + ST_SH(vec4, temp_ptr + 128); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 320); + ST_SH(vec4, temp_ptr + 640); + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 576); + ST_SH(vec4, temp_ptr + 384); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 64); + ST_SH(vec4, temp_ptr + 896); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 step0, step1, step2, step3, step4, step5, step6, step7; + + LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + step0, step1, step2, step3, step4, step5, step6, step7, + in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); + + /* 2nd set */ + LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + step0, step1, step2, step3, step4, step5, step6, step7, + in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, + (output + 8 * 8), 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, + in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, + tmp0_w, tmp1_w, tmp2_w, tmp3_w); + BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, + vec0_r, vec1_r, vec2_r, vec3_r); + + tmp3_w = vec0_r + vec3_r; + vec0_r = vec0_r - vec3_r; + vec3_r = vec1_r + vec2_r; + vec1_r = vec1_r - vec2_r; + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, + cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out, 8); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, + cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out + 16, 8); + + LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 32); + ST_SH(in5, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 40); + ST_SH(in5, out + 48); + + LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 64); + ST_SH(in5, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 72); + ST_SH(in5, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 80); + ST_SH(in5, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 96); + ST_SH(in5, out + 88); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, + in8, in9, in10, in11, in12, in13, in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = LD_SH(temp); + in4 = LD_SH(temp + 32); + in2 = LD_SH(temp + 64); + in6 = LD_SH(temp + 96); + in1 = LD_SH(temp + 128); + in7 = LD_SH(temp + 152); + in3 = LD_SH(temp + 192); + in5 = LD_SH(temp + 216); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = LD_SH(temp + 16); + in1_1 = LD_SH(temp + 232); + in2_1 = LD_SH(temp + 80); + in3_1 = LD_SH(temp + 168); + in4_1 = LD_SH(temp + 48); + in5_1 = LD_SH(temp + 176); + in6_1 = LD_SH(temp + 112); + in7_1 = LD_SH(temp + 240); + + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = LD_SH(temp + 8); + in1 = LD_SH(temp + 136); + in2 = LD_SH(temp + 72); + in3 = LD_SH(temp + 200); + in4 = LD_SH(temp + 40); + in5 = LD_SH(temp + 208); + in6 = LD_SH(temp + 104); + in7 = LD_SH(temp + 144); + + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + output + 8, 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); + + /* 4th set */ + in0_1 = LD_SH(temp + 24); + in1_1 = LD_SH(temp + 224); + in2_1 = LD_SH(temp + 88); + in3_1 = LD_SH(temp + 160); + in4_1 = LD_SH(temp + 56); + in5_1 = LD_SH(temp + 184); + in6_1 = LD_SH(temp + 120); + in7_1 = LD_SH(temp + 248); + + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + output + 24, 32); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vp9_fdct32x32_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, + in8, in9, in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + + temp0 = in0 + in3; + in0 = in0 - in3; + in3 = in1 + in2; + in1 = in1 - in2; + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31; + v8i16 vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = in28 + in29; + in19 = in31 + in30; + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vp9_fdct32x32_rd_msa(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} diff --git a/vpx_dsp/mips/fwd_txfm_msa.h b/vpx_dsp/mips/fwd_txfm_msa.h index d4c68ec2c..ca307a074 100644 --- a/vpx_dsp/mips/fwd_txfm_msa.h +++ b/vpx_dsp/mips/fwd_txfm_msa.h @@ -273,6 +273,85 @@ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ } +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ +} + +#define FDCT32_POSTPROC_NEG_W(vec) { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ +} + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ +} + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \ + reg1_right, const0, const1, \ + out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t) const0); \ + \ + s0_m = __msa_fill_w((int32_t) const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ +} + void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); void fdct16x8_1d_row(int16_t *input, int16_t *output); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index e3bf5a2c5..30732dd92 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -71,14 +71,18 @@ DSP_SRCS-yes += fwd_txfm.h DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h ifeq ($(ARCH_X86_64),yes) ifeq ($(CONFIG_USE_X86INC),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif endif +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c endif # CONFIG_VP9_ENCODER # quantization diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 6dc44234a..02790b0dc 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -138,6 +138,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct16x16 sse2/; + add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32 sse2/; + + add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_rd sse2/; + add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct4x4 sse2/; @@ -146,6 +152,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct16x16 sse2/; + + add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_highbd_fdct32x32 sse2/; + + add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_highbd_fdct32x32_rd sse2/; } else { add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct4x4 sse2 msa/; @@ -155,6 +167,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct16x16 sse2 msa/; + + add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32 sse2 avx2 msa/; + + add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h similarity index 99% rename from vp9/encoder/x86/vp9_dct32x32_avx2_impl.h rename to vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index ba5e0597a..4df39dff8 100644 --- a/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -10,9 +10,7 @@ #include // AVX2 -#include "./vp9_rtcd.h" #include "vpx_dsp/txfm_common.h" -#include "vpx_ports/mem.h" #define pair256_set_epi16(a, b) \ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h similarity index 99% rename from vp9/encoder/x86/vp9_dct32x32_sse2_impl.h rename to vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index f0707eaa0..e0d272d74 100644 --- a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -10,11 +10,13 @@ #include // SSE2 -#include "vp9/encoder/vp9_dct.h" +#include "vpx_dsp/fwd_txfm.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#include "vpx_ports/mem.h" +// TODO(jingning) The high bit-depth version needs re-work for performance. +// The current SSE2 implementation also causes cross reference to the static +// functions in the C implementation file. #if DCT_HIGH_BIT_DEPTH #define ADD_EPI16 _mm_adds_epi16 #define SUB_EPI16 _mm_subs_epi16 diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c similarity index 75% rename from vp9/encoder/x86/vp9_dct_avx2.c rename to vpx_dsp/x86/fwd_txfm_avx2.c index 8f3b61ad8..c1d4f40ea 100644 --- a/vp9/encoder/x86/vp9_dct_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -8,19 +8,16 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include // AVX2 -#include "vp9/common/vp9_idct.h" // for cospi constants -#include "vpx_ports/mem.h" - +#include "./vpx_config.h" #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 -#include "vp9/encoder/x86/vp9_dct32x32_avx2_impl.h" +#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" #undef FDCT32x32_2D_AVX2 #undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vp9/encoder/x86/vp9_dct32x32_avx2_impl.h" // NOLINT +#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT #undef FDCT32x32_2D_AVX2 #undef FDCT32x32_HIGH_PRECISION diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c index a868d2230..37beeec73 100644 --- a/vpx_dsp/x86/fwd_txfm_sse2.c +++ b/vpx_dsp/x86/fwd_txfm_sse2.c @@ -11,7 +11,6 @@ #include "./vpx_config.h" #define DCT_HIGH_BIT_DEPTH 0 - #define FDCT4x4_2D vp9_fdct4x4_sse2 #define FDCT8x8_2D vp9_fdct8x8_sse2 #define FDCT16x16_2D vp9_fdct16x16_sse2 @@ -19,6 +18,18 @@ #undef FDCT4x4_2D #undef FDCT8x8_2D #undef FDCT16x16_2D + +#define FDCT32x32_2D vp9_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D vp9_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION #undef DCT_HIGH_BIT_DEPTH #if CONFIG_VP9_HIGHBITDEPTH @@ -30,5 +41,17 @@ #undef FDCT4x4_2D #undef FDCT8x8_2D #undef FDCT16x16_2D + +#define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION #undef DCT_HIGH_BIT_DEPTH #endif // CONFIG_VP9_HIGHBITDEPTH