From d19033fa4e46a2a97adcf752ccebe79bc86662a9 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Tue, 28 Jul 2015 14:42:25 -0700 Subject: [PATCH] Move DC only forward 2D-DCT functions to vpx_dsp This completes the forward transform functions layout refactoring. Change-Id: I996fb0fb795f41e2040f7b21db985774098aedbd --- vp9/common/vp9_rtcd_defs.pl | 33 ---- vp9/encoder/arm/neon/vp9_dct_neon.c | 20 +-- vp9/encoder/mips/msa/vp9_fdct16x16_msa.c | 12 +- vp9/encoder/mips/msa/vp9_fdct32x32_msa.c | 33 ---- vp9/encoder/mips/msa/vp9_fdct4x4_msa.c | 2 +- vp9/encoder/mips/msa/vp9_fdct8x8_msa.c | 7 +- vp9/encoder/mips/msa/vp9_fdct_msa.h | 15 -- vp9/encoder/vp9_dct.c | 48 ----- vp9/encoder/x86/vp9_dct_sse2.c | 210 ---------------------- vp9/vp9cx.mk | 1 - vpx_dsp/arm/fwd_txfm_neon.c | 17 ++ vpx_dsp/fwd_txfm.c | 70 ++++++-- vpx_dsp/mips/fwd_dct32x32_msa.c | 22 +++ vpx_dsp/mips/fwd_txfm_msa.c | 15 ++ vpx_dsp/mips/fwd_txfm_msa.h | 15 ++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 33 ++++ vpx_dsp/x86/fwd_txfm_sse2.c | 214 +++++++++++++++++++++++ 17 files changed, 380 insertions(+), 387 deletions(-) delete mode 100644 vp9/encoder/mips/msa/vp9_fdct32x32_msa.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index a713f7feb..c652c0e0a 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -825,18 +825,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; - - add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct4x4_1 sse2/; - - add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8_1 sse2/; - - add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct16x16_1 sse2/; - - add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32_1 sse2/; } else { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp9_fht4x4 sse2 msa/; @@ -849,18 +837,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc"; - - add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct4x4_1 sse2/; - - add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8_1 sse2 neon msa/; - - add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct16x16_1 sse2 msa/; - - add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32_1 sse2 msa/; } # @@ -914,15 +890,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fwht4x4/; - add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_highbd_fdct8x8_1/; - - add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_highbd_fdct16x16_1/; - - add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_highbd_fdct32x32_1/; - add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; specialize qw/vp9_highbd_temporal_filter_apply/; diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index 941ad2000..15dc132eb 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -9,30 +9,14 @@ */ #include + #include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vpx_dsp/txfm_common.h" -void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { - int r; - int16x8_t sum = vld1q_s16(&input[0]); - for (r = 1; r < 8; ++r) { - const int16x8_t input_00 = vld1q_s16(&input[r * stride]); - sum = vaddq_s16(sum, input_00); - } - { - const int32x4_t a = vpaddlq_s16(sum); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); - output[1] = 0; - } -} - void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, int16_t* coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t* zbin_ptr, diff --git a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c index de0295672..6dabb5890 100644 --- a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c +++ b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c @@ -10,20 +10,10 @@ #include -#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" -void vp9_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[1] = 0; - - out[0] = LD_HADD(input, stride); - out[0] += LD_HADD(input + 8, stride); - out[0] += LD_HADD(input + 16 * 8, stride); - out[0] += LD_HADD(input + 16 * 8 + 8, stride); - out[0] >>= 1; -} - static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, const int32_t *const0, int16_t *int_buf) { v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; diff --git a/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c b/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c deleted file mode 100644 index 81f2c3a48..000000000 --- a/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" - -void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[1] = 0; - - out[0] = LD_HADD(input, stride); - out[0] += LD_HADD(input + 8, stride); - out[0] += LD_HADD(input + 16, stride); - out[0] += LD_HADD(input + 24, stride); - out[0] += LD_HADD(input + 32 * 8, stride); - out[0] += LD_HADD(input + 32 * 8 + 8, stride); - out[0] += LD_HADD(input + 32 * 8 + 16, stride); - out[0] += LD_HADD(input + 32 * 8 + 24, stride); - out[0] += LD_HADD(input + 32 * 16, stride); - out[0] += LD_HADD(input + 32 * 16 + 8, stride); - out[0] += LD_HADD(input + 32 * 16 + 16, stride); - out[0] += LD_HADD(input + 32 * 16 + 24, stride); - out[0] += LD_HADD(input + 32 * 24, stride); - out[0] += LD_HADD(input + 32 * 24 + 8, stride); - out[0] += LD_HADD(input + 32 * 24 + 16, stride); - out[0] += LD_HADD(input + 32 * 24 + 24, stride); - out[0] >>= 3; -} diff --git a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c index ce47457fa..574016f15 100644 --- a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c +++ b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c @@ -10,7 +10,7 @@ #include -#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" void vp9_fwht4x4_msa(const int16_t *input, int16_t *output, diff --git a/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c index 253b5e846..7c3c635f8 100644 --- a/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c +++ b/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c @@ -10,14 +10,9 @@ #include -#include "./vp9_rtcd.h" +#include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" -void vp9_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[0] = LD_HADD(input, stride); - out[1] = 0; -} - void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, int32_t tx_type) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; diff --git a/vp9/encoder/mips/msa/vp9_fdct_msa.h b/vp9/encoder/mips/msa/vp9_fdct_msa.h index 504d36154..d7d40cb72 100644 --- a/vp9/encoder/mips/msa/vp9_fdct_msa.h +++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -81,21 +81,6 @@ out5 = -out5; \ } -#define LD_HADD(psrc, stride) ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \ - in4_m, in6_m, in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ -}) - #define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) { \ v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 9f1c74015..09b2bbbf2 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -529,17 +529,6 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 4; ++r) - for (c = 0; c < 4; ++c) - sum += input[r * stride + c]; - - output[0] = sum << 1; - output[1] = 0; -} - void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { @@ -572,17 +561,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, } } -void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 8; ++r) - for (c = 0; c < 8; ++c) - sum += input[r * stride + c]; - - output[0] = sum; - output[1] = 0; -} - void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, @@ -695,17 +673,6 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, *eob_ptr = eob + 1; } -void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 16; ++r) - for (c = 0; c < 16; ++c) - sum += input[r * stride + c]; - - output[0] = sum >> 1; - output[1] = 0; -} - void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { @@ -828,16 +795,6 @@ void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, vp9_fht4x4_c(input, output, stride, tx_type); } -void vp9_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, - int stride) { - vp9_fdct8x8_1_c(input, final_output, stride); -} - -void vp9_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, - int stride) { - vp9_fdct16x16_1_c(input, output, stride); -} - void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { vp9_fht8x8_c(input, output, stride, tx_type); @@ -852,9 +809,4 @@ void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { vp9_fht16x16_c(input, output, stride, tx_type); } - -void vp9_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, - int stride) { - vp9_fdct32x32_1_c(input, out, stride); -} #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index b39346080..9de82872f 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -18,35 +18,6 @@ #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" -void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0, in1; - __m128i tmp; - const __m128i zero = _mm_setzero_si128(); - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) - (input + 2 * stride))); - in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) - (input + 3 * stride))); - - tmp = _mm_add_epi16(in0, in1); - in0 = _mm_unpacklo_epi16(zero, tmp); - in1 = _mm_unpackhi_epi16(zero, tmp); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(tmp, zero); - in1 = _mm_unpackhi_epi32(tmp, zero); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(tmp, 8); - - in1 = _mm_add_epi32(tmp, in0); - in0 = _mm_slli_epi32(in1, 1); - store_output(&in0, output); -} - static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, int stride) { const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); @@ -212,46 +183,6 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, } } -void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i u0, u1, sum; - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - sum = _mm_add_epi16(u0, u1); - - in0 = _mm_add_epi16(in0, in1); - in2 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, in0); - - u0 = _mm_setzero_si128(); - sum = _mm_add_epi16(sum, in2); - - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - store_output(&in1, output); -} - void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, int16_t* coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t* zbin_ptr, @@ -1239,75 +1170,6 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, } } -void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, - int stride) { - __m128i in0, in1, in2, in3; - __m128i u0, u1; - __m128i sum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 2; ++i) { - input += 8 * i; - in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - sum = _mm_add_epi16(sum, u1); - } - - u0 = _mm_setzero_si128(); - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - in1 = _mm_srai_epi32(in1, 1); - store_output(&in1, output); -} - static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, __m128i *in1, int stride) { // load first 8 columns @@ -2194,75 +2056,3 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, break; } } - -void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, - int stride) { - __m128i in0, in1, in2, in3; - __m128i u0, u1; - __m128i sum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 8; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - sum = _mm_add_epi16(sum, u1); - } - - u0 = _mm_setzero_si128(); - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - in1 = _mm_srai_epi32(in1, 3); - store_output(&in1, output); -} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index c9278d2a6..2b5d0a70d 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -136,7 +136,6 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c index 6258b610a..406b10d4c 100644 --- a/vpx_dsp/arm/fwd_txfm_neon.c +++ b/vpx_dsp/arm/fwd_txfm_neon.c @@ -201,3 +201,20 @@ void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { vst1q_s16(&final_output[7 * 8], input_7); } } + +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c index 337b82e03..558ca9a53 100644 --- a/vpx_dsp/fwd_txfm.c +++ b/vpx_dsp/fwd_txfm.c @@ -77,6 +77,17 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { } } +void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 4; ++r) + for (c = 0; c < 4; ++c) + sum += input[r * stride + c]; + + output[0] = sum << 1; + output[1] = 0; +} + void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { int i, j; tran_low_t intermediate[64]; @@ -163,6 +174,17 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { } } +void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 8; ++r) + for (c = 0; c < 8; ++c) + sum += input[r * stride + c]; + + output[0] = sum; + output[1] = 0; +} + void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -343,6 +365,17 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { } } +void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 16; ++r) + for (c = 0; c < 16; ++c) + sum += input[r * stride + c]; + + output[0] = sum >> 1; + output[1] = 0; +} + static INLINE tran_high_t dct_32_round(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); // TODO(debargha, peter.derivaz): Find new bounds for this assert, @@ -679,17 +712,6 @@ void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 32; ++r) - for (c = 0; c < 32; ++c) - sum += input[r * stride + c]; - - output[0] = sum >> 3; - output[1] = 0; -} - void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { int i, j; tran_high_t output[32 * 32]; @@ -747,6 +769,17 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { } } +void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) + sum += input[r * stride + c]; + + output[0] = sum >> 3; + output[1] = 0; +} + #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { @@ -758,11 +791,21 @@ void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, vp9_fdct8x8_c(input, final_output, stride); } +void vp9_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, + int stride) { + vp9_fdct8x8_1_c(input, final_output, stride); +} + void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { vp9_fdct16x16_c(input, output, stride); } +void vp9_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, + int stride) { + vp9_fdct16x16_1_c(input, output, stride); +} + void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { vp9_fdct32x32_c(input, out, stride); } @@ -771,4 +814,9 @@ void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { vp9_fdct32x32_rd_c(input, out, stride); } + +void vp9_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, + int stride) { + vp9_fdct32x32_1_c(input, out, stride); +} #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c index 80573f1c4..0219571c7 100644 --- a/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -931,3 +931,25 @@ void vp9_fdct32x32_rd_msa(const int16_t *input, int16_t *out, out + (8 * i * 32)); } } + +void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[1] = 0; + + out[0] = LD_HADD(input, stride); + out[0] += LD_HADD(input + 8, stride); + out[0] += LD_HADD(input + 16, stride); + out[0] += LD_HADD(input + 24, stride); + out[0] += LD_HADD(input + 32 * 8, stride); + out[0] += LD_HADD(input + 32 * 8 + 8, stride); + out[0] += LD_HADD(input + 32 * 8 + 16, stride); + out[0] += LD_HADD(input + 32 * 8 + 24, stride); + out[0] += LD_HADD(input + 32 * 16, stride); + out[0] += LD_HADD(input + 32 * 16 + 8, stride); + out[0] += LD_HADD(input + 32 * 16 + 16, stride); + out[0] += LD_HADD(input + 32 * 16 + 24, stride); + out[0] += LD_HADD(input + 32 * 24, stride); + out[0] += LD_HADD(input + 32 * 24 + 8, stride); + out[0] += LD_HADD(input + 32 * 24 + 16, stride); + out[0] += LD_HADD(input + 32 * 24 + 24, stride); + out[0] >>= 3; +} diff --git a/vpx_dsp/mips/fwd_txfm_msa.c b/vpx_dsp/mips/fwd_txfm_msa.c index 8a7e7b6a9..1e35542f7 100644 --- a/vpx_dsp/mips/fwd_txfm_msa.c +++ b/vpx_dsp/mips/fwd_txfm_msa.c @@ -215,6 +215,11 @@ void vp9_fdct8x8_msa(const int16_t *input, int16_t *output, ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } +void vp9_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[0] = LD_HADD(input, stride); + out[1] = 0; +} + void vp9_fdct16x16_msa(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; @@ -230,3 +235,13 @@ void vp9_fdct16x16_msa(const int16_t *input, int16_t *output, fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); } } + +void vp9_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[1] = 0; + + out[0] = LD_HADD(input, stride); + out[0] += LD_HADD(input + 8, stride); + out[0] += LD_HADD(input + 16 * 8, stride); + out[0] += LD_HADD(input + 16 * 8 + 8, stride); + out[0] >>= 1; +} diff --git a/vpx_dsp/mips/fwd_txfm_msa.h b/vpx_dsp/mips/fwd_txfm_msa.h index ca307a074..d1e160eed 100644 --- a/vpx_dsp/mips/fwd_txfm_msa.h +++ b/vpx_dsp/mips/fwd_txfm_msa.h @@ -14,6 +14,21 @@ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" +#define LD_HADD(psrc, stride) ({ \ + v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ + v4i32 vec_w_m; \ + \ + LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ + ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ + LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ + ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \ + in4_m, in6_m, in0_m, in4_m); \ + in0_m += in4_m; \ + \ + vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ + HADD_SW_S32(vec_w_m); \ +}) + #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 02790b0dc..4d6843bbd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -132,47 +132,80 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct4x4 sse2/; + add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct4x4_1 sse2/; + add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct8x8 sse2/; + add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct8x8_1 sse2/; + add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct16x16 sse2/; + add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct16x16_1 sse2/; + add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct32x32 sse2/; add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct32x32_rd sse2/; + add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_1 sse2/; + add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct4x4 sse2/; add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct8x8 sse2/; + add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_highbd_fdct8x8_1/; + add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct16x16 sse2/; + add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_highbd_fdct16x16_1/; + add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct32x32 sse2/; add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fdct32x32_rd sse2/; + + add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_highbd_fdct32x32_1/; } else { add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct4x4 sse2 msa/; + add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct4x4_1 sse2/; + add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct8x8 sse2 neon msa/, "$ssse3_x86_64_x86inc"; + add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct8x8_1 sse2 neon msa/; + add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct16x16 sse2 msa/; + add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct16x16_1 sse2 msa/; + add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct32x32 sse2 avx2 msa/; add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/; + + add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp9_fdct32x32_1 sse2 msa/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c index 37beeec73..2704e6839 100644 --- a/vpx_dsp/x86/fwd_txfm_sse2.c +++ b/vpx_dsp/x86/fwd_txfm_sse2.c @@ -8,7 +8,221 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include // SSE2 + #include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/fwd_txfm_sse2.h" + +void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0, in1; + __m128i tmp; + const __m128i zero = _mm_setzero_si128(); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) + (input + 2 * stride))); + in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) + (input + 3 * stride))); + + tmp = _mm_add_epi16(in0, in1); + in0 = _mm_unpacklo_epi16(zero, tmp); + in1 = _mm_unpackhi_epi16(zero, tmp); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(tmp, zero); + in1 = _mm_unpackhi_epi32(tmp, zero); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(tmp, 8); + + in1 = _mm_add_epi32(tmp, in0); + in0 = _mm_slli_epi32(in1, 1); + store_output(&in0, output); +} + +void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + store_output(&in1, output); +} + +void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, + int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 2; ++i) { + input += 8 * i; + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 1); + store_output(&in1, output); +} + +void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, + int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 3); + store_output(&in1, output); +} #define DCT_HIGH_BIT_DEPTH 0 #define FDCT4x4_2D vp9_fdct4x4_sse2