Add vp9_highbd_iht16x16_256_add_sse4_1()
BUG=webm:1413 Change-Id: I8d7eeae1bd219eb848c1a86071046a477f7a91af
This commit is contained in:
parent
8fd648c78a
commit
231012fdab
@ -597,7 +597,9 @@ class TransHT : public TransTestBase {
|
||||
TransHT() { fwd_txfm_ref = fht_ref; }
|
||||
};
|
||||
|
||||
TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); }
|
||||
TEST_P(TransHT, AccuracyCheck) {
|
||||
RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 ? 2 : 1);
|
||||
}
|
||||
|
||||
TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); }
|
||||
|
||||
@ -605,17 +607,6 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); }
|
||||
|
||||
TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
|
||||
|
||||
/* TODO:(johannkoenig) Determine why these fail AccuracyCheck
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_12, 2),
|
||||
*/
|
||||
|
||||
const DctParam c_ht_tests[] = {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
@ -642,6 +633,19 @@ const DctParam c_ht_tests[] = {
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
|
||||
VPX_BITS_10, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0,
|
||||
VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1,
|
||||
VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2,
|
||||
VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
|
||||
VPX_BITS_12, 2),
|
||||
|
||||
make_tuple(&vp9_highbd_fht8x8_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8, 0, VPX_BITS_8,
|
||||
2),
|
||||
@ -784,6 +788,43 @@ INSTANTIATE_TEST_CASE_P(
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE4_1, TransHT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
0, VPX_BITS_8, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
1, VPX_BITS_8, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
2, VPX_BITS_8, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
3, VPX_BITS_8, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
0, VPX_BITS_10, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
1, VPX_BITS_10, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
2, VPX_BITS_10, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
3, VPX_BITS_10, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
0, VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
1, VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
2, VPX_BITS_12, 2),
|
||||
make_tuple(&vp9_highbd_fht16x16_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
|
||||
3, VPX_BITS_12, 2),
|
||||
|
||||
make_tuple(&vp9_highbd_fht8x8_c,
|
||||
&highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
|
||||
VPX_BITS_8, 2),
|
||||
|
@ -105,6 +105,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
|
||||
specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
|
||||
specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
|
||||
specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
|
||||
}
|
||||
}
|
||||
|
||||
|
419
vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
Normal file
419
vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
Normal file
@ -0,0 +1,419 @@
|
||||
/*
|
||||
* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "vp9/common/vp9_idct.h"
|
||||
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
|
||||
const int c,
|
||||
__m128i *const s) {
|
||||
const __m128i pair_c = pair_set_epi32(4 * c, 0);
|
||||
__m128i x[2];
|
||||
|
||||
extend_64bit(in, x);
|
||||
s[0] = _mm_mul_epi32(pair_c, x[0]);
|
||||
s[1] = _mm_mul_epi32(pair_c, x[1]);
|
||||
}
|
||||
|
||||
static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
|
||||
const __m128i in1,
|
||||
const int c0, const int c1,
|
||||
__m128i *const s0,
|
||||
__m128i *const s1) {
|
||||
const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
|
||||
const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
|
||||
__m128i t00[2], t01[2], t10[2], t11[2];
|
||||
__m128i x0[2], x1[2];
|
||||
|
||||
extend_64bit(in0, x0);
|
||||
extend_64bit(in1, x1);
|
||||
t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
|
||||
t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
|
||||
t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
|
||||
t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
|
||||
t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
|
||||
t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
|
||||
t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
|
||||
t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
|
||||
|
||||
s0[0] = _mm_add_epi64(t00[0], t11[0]);
|
||||
s0[1] = _mm_add_epi64(t00[1], t11[1]);
|
||||
s1[0] = _mm_sub_epi64(t10[0], t01[0]);
|
||||
s1[1] = _mm_sub_epi64(t10[1], t01[1]);
|
||||
}
|
||||
|
||||
static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
|
||||
__m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
|
||||
s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
|
||||
__m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
|
||||
x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
|
||||
|
||||
// stage 1
|
||||
highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
|
||||
highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
|
||||
highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
|
||||
highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
|
||||
highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
|
||||
highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
|
||||
s11);
|
||||
highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
|
||||
s13);
|
||||
highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
|
||||
s15);
|
||||
|
||||
x0[0] = _mm_add_epi64(s0[0], s8[0]);
|
||||
x0[1] = _mm_add_epi64(s0[1], s8[1]);
|
||||
x1[0] = _mm_add_epi64(s1[0], s9[0]);
|
||||
x1[1] = _mm_add_epi64(s1[1], s9[1]);
|
||||
x2[0] = _mm_add_epi64(s2[0], s10[0]);
|
||||
x2[1] = _mm_add_epi64(s2[1], s10[1]);
|
||||
x3[0] = _mm_add_epi64(s3[0], s11[0]);
|
||||
x3[1] = _mm_add_epi64(s3[1], s11[1]);
|
||||
x4[0] = _mm_add_epi64(s4[0], s12[0]);
|
||||
x4[1] = _mm_add_epi64(s4[1], s12[1]);
|
||||
x5[0] = _mm_add_epi64(s5[0], s13[0]);
|
||||
x5[1] = _mm_add_epi64(s5[1], s13[1]);
|
||||
x6[0] = _mm_add_epi64(s6[0], s14[0]);
|
||||
x6[1] = _mm_add_epi64(s6[1], s14[1]);
|
||||
x7[0] = _mm_add_epi64(s7[0], s15[0]);
|
||||
x7[1] = _mm_add_epi64(s7[1], s15[1]);
|
||||
x8[0] = _mm_sub_epi64(s0[0], s8[0]);
|
||||
x8[1] = _mm_sub_epi64(s0[1], s8[1]);
|
||||
x9[0] = _mm_sub_epi64(s1[0], s9[0]);
|
||||
x9[1] = _mm_sub_epi64(s1[1], s9[1]);
|
||||
x10[0] = _mm_sub_epi64(s2[0], s10[0]);
|
||||
x10[1] = _mm_sub_epi64(s2[1], s10[1]);
|
||||
x11[0] = _mm_sub_epi64(s3[0], s11[0]);
|
||||
x11[1] = _mm_sub_epi64(s3[1], s11[1]);
|
||||
x12[0] = _mm_sub_epi64(s4[0], s12[0]);
|
||||
x12[1] = _mm_sub_epi64(s4[1], s12[1]);
|
||||
x13[0] = _mm_sub_epi64(s5[0], s13[0]);
|
||||
x13[1] = _mm_sub_epi64(s5[1], s13[1]);
|
||||
x14[0] = _mm_sub_epi64(s6[0], s14[0]);
|
||||
x14[1] = _mm_sub_epi64(s6[1], s14[1]);
|
||||
x15[0] = _mm_sub_epi64(s7[0], s15[0]);
|
||||
x15[1] = _mm_sub_epi64(s7[1], s15[1]);
|
||||
|
||||
x0[0] = dct_const_round_shift_64bit(x0[0]);
|
||||
x0[1] = dct_const_round_shift_64bit(x0[1]);
|
||||
x1[0] = dct_const_round_shift_64bit(x1[0]);
|
||||
x1[1] = dct_const_round_shift_64bit(x1[1]);
|
||||
x2[0] = dct_const_round_shift_64bit(x2[0]);
|
||||
x2[1] = dct_const_round_shift_64bit(x2[1]);
|
||||
x3[0] = dct_const_round_shift_64bit(x3[0]);
|
||||
x3[1] = dct_const_round_shift_64bit(x3[1]);
|
||||
x4[0] = dct_const_round_shift_64bit(x4[0]);
|
||||
x4[1] = dct_const_round_shift_64bit(x4[1]);
|
||||
x5[0] = dct_const_round_shift_64bit(x5[0]);
|
||||
x5[1] = dct_const_round_shift_64bit(x5[1]);
|
||||
x6[0] = dct_const_round_shift_64bit(x6[0]);
|
||||
x6[1] = dct_const_round_shift_64bit(x6[1]);
|
||||
x7[0] = dct_const_round_shift_64bit(x7[0]);
|
||||
x7[1] = dct_const_round_shift_64bit(x7[1]);
|
||||
x8[0] = dct_const_round_shift_64bit(x8[0]);
|
||||
x8[1] = dct_const_round_shift_64bit(x8[1]);
|
||||
x9[0] = dct_const_round_shift_64bit(x9[0]);
|
||||
x9[1] = dct_const_round_shift_64bit(x9[1]);
|
||||
x10[0] = dct_const_round_shift_64bit(x10[0]);
|
||||
x10[1] = dct_const_round_shift_64bit(x10[1]);
|
||||
x11[0] = dct_const_round_shift_64bit(x11[0]);
|
||||
x11[1] = dct_const_round_shift_64bit(x11[1]);
|
||||
x12[0] = dct_const_round_shift_64bit(x12[0]);
|
||||
x12[1] = dct_const_round_shift_64bit(x12[1]);
|
||||
x13[0] = dct_const_round_shift_64bit(x13[0]);
|
||||
x13[1] = dct_const_round_shift_64bit(x13[1]);
|
||||
x14[0] = dct_const_round_shift_64bit(x14[0]);
|
||||
x14[1] = dct_const_round_shift_64bit(x14[1]);
|
||||
x15[0] = dct_const_round_shift_64bit(x15[0]);
|
||||
x15[1] = dct_const_round_shift_64bit(x15[1]);
|
||||
x0[0] = pack_4(x0[0], x0[1]);
|
||||
x1[0] = pack_4(x1[0], x1[1]);
|
||||
x2[0] = pack_4(x2[0], x2[1]);
|
||||
x3[0] = pack_4(x3[0], x3[1]);
|
||||
x4[0] = pack_4(x4[0], x4[1]);
|
||||
x5[0] = pack_4(x5[0], x5[1]);
|
||||
x6[0] = pack_4(x6[0], x6[1]);
|
||||
x7[0] = pack_4(x7[0], x7[1]);
|
||||
x8[0] = pack_4(x8[0], x8[1]);
|
||||
x9[0] = pack_4(x9[0], x9[1]);
|
||||
x10[0] = pack_4(x10[0], x10[1]);
|
||||
x11[0] = pack_4(x11[0], x11[1]);
|
||||
x12[0] = pack_4(x12[0], x12[1]);
|
||||
x13[0] = pack_4(x13[0], x13[1]);
|
||||
x14[0] = pack_4(x14[0], x14[1]);
|
||||
x15[0] = pack_4(x15[0], x15[1]);
|
||||
|
||||
// stage 2
|
||||
s0[0] = x0[0];
|
||||
s1[0] = x1[0];
|
||||
s2[0] = x2[0];
|
||||
s3[0] = x3[0];
|
||||
s4[0] = x4[0];
|
||||
s5[0] = x5[0];
|
||||
s6[0] = x6[0];
|
||||
s7[0] = x7[0];
|
||||
x0[0] = _mm_add_epi32(s0[0], s4[0]);
|
||||
x1[0] = _mm_add_epi32(s1[0], s5[0]);
|
||||
x2[0] = _mm_add_epi32(s2[0], s6[0]);
|
||||
x3[0] = _mm_add_epi32(s3[0], s7[0]);
|
||||
x4[0] = _mm_sub_epi32(s0[0], s4[0]);
|
||||
x5[0] = _mm_sub_epi32(s1[0], s5[0]);
|
||||
x6[0] = _mm_sub_epi32(s2[0], s6[0]);
|
||||
x7[0] = _mm_sub_epi32(s3[0], s7[0]);
|
||||
|
||||
highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
|
||||
highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
|
||||
s11);
|
||||
highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
|
||||
s12);
|
||||
highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
|
||||
s14);
|
||||
|
||||
x8[0] = _mm_add_epi64(s8[0], s12[0]);
|
||||
x8[1] = _mm_add_epi64(s8[1], s12[1]);
|
||||
x9[0] = _mm_add_epi64(s9[0], s13[0]);
|
||||
x9[1] = _mm_add_epi64(s9[1], s13[1]);
|
||||
x10[0] = _mm_add_epi64(s10[0], s14[0]);
|
||||
x10[1] = _mm_add_epi64(s10[1], s14[1]);
|
||||
x11[0] = _mm_add_epi64(s11[0], s15[0]);
|
||||
x11[1] = _mm_add_epi64(s11[1], s15[1]);
|
||||
x12[0] = _mm_sub_epi64(s8[0], s12[0]);
|
||||
x12[1] = _mm_sub_epi64(s8[1], s12[1]);
|
||||
x13[0] = _mm_sub_epi64(s9[0], s13[0]);
|
||||
x13[1] = _mm_sub_epi64(s9[1], s13[1]);
|
||||
x14[0] = _mm_sub_epi64(s10[0], s14[0]);
|
||||
x14[1] = _mm_sub_epi64(s10[1], s14[1]);
|
||||
x15[0] = _mm_sub_epi64(s11[0], s15[0]);
|
||||
x15[1] = _mm_sub_epi64(s11[1], s15[1]);
|
||||
x8[0] = dct_const_round_shift_64bit(x8[0]);
|
||||
x8[1] = dct_const_round_shift_64bit(x8[1]);
|
||||
x9[0] = dct_const_round_shift_64bit(x9[0]);
|
||||
x9[1] = dct_const_round_shift_64bit(x9[1]);
|
||||
x10[0] = dct_const_round_shift_64bit(x10[0]);
|
||||
x10[1] = dct_const_round_shift_64bit(x10[1]);
|
||||
x11[0] = dct_const_round_shift_64bit(x11[0]);
|
||||
x11[1] = dct_const_round_shift_64bit(x11[1]);
|
||||
x12[0] = dct_const_round_shift_64bit(x12[0]);
|
||||
x12[1] = dct_const_round_shift_64bit(x12[1]);
|
||||
x13[0] = dct_const_round_shift_64bit(x13[0]);
|
||||
x13[1] = dct_const_round_shift_64bit(x13[1]);
|
||||
x14[0] = dct_const_round_shift_64bit(x14[0]);
|
||||
x14[1] = dct_const_round_shift_64bit(x14[1]);
|
||||
x15[0] = dct_const_round_shift_64bit(x15[0]);
|
||||
x15[1] = dct_const_round_shift_64bit(x15[1]);
|
||||
x8[0] = pack_4(x8[0], x8[1]);
|
||||
x9[0] = pack_4(x9[0], x9[1]);
|
||||
x10[0] = pack_4(x10[0], x10[1]);
|
||||
x11[0] = pack_4(x11[0], x11[1]);
|
||||
x12[0] = pack_4(x12[0], x12[1]);
|
||||
x13[0] = pack_4(x13[0], x13[1]);
|
||||
x14[0] = pack_4(x14[0], x14[1]);
|
||||
x15[0] = pack_4(x15[0], x15[1]);
|
||||
|
||||
// stage 3
|
||||
s0[0] = x0[0];
|
||||
s1[0] = x1[0];
|
||||
s2[0] = x2[0];
|
||||
s3[0] = x3[0];
|
||||
highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
|
||||
highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
|
||||
s8[0] = x8[0];
|
||||
s9[0] = x9[0];
|
||||
s10[0] = x10[0];
|
||||
s11[0] = x11[0];
|
||||
highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
|
||||
s13);
|
||||
highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
|
||||
s14);
|
||||
|
||||
x0[0] = _mm_add_epi32(s0[0], s2[0]);
|
||||
x1[0] = _mm_add_epi32(s1[0], s3[0]);
|
||||
x2[0] = _mm_sub_epi32(s0[0], s2[0]);
|
||||
x3[0] = _mm_sub_epi32(s1[0], s3[0]);
|
||||
x4[0] = _mm_add_epi64(s4[0], s6[0]);
|
||||
x4[1] = _mm_add_epi64(s4[1], s6[1]);
|
||||
x5[0] = _mm_add_epi64(s5[0], s7[0]);
|
||||
x5[1] = _mm_add_epi64(s5[1], s7[1]);
|
||||
x6[0] = _mm_sub_epi64(s4[0], s6[0]);
|
||||
x6[1] = _mm_sub_epi64(s4[1], s6[1]);
|
||||
x7[0] = _mm_sub_epi64(s5[0], s7[0]);
|
||||
x7[1] = _mm_sub_epi64(s5[1], s7[1]);
|
||||
x4[0] = dct_const_round_shift_64bit(x4[0]);
|
||||
x4[1] = dct_const_round_shift_64bit(x4[1]);
|
||||
x5[0] = dct_const_round_shift_64bit(x5[0]);
|
||||
x5[1] = dct_const_round_shift_64bit(x5[1]);
|
||||
x6[0] = dct_const_round_shift_64bit(x6[0]);
|
||||
x6[1] = dct_const_round_shift_64bit(x6[1]);
|
||||
x7[0] = dct_const_round_shift_64bit(x7[0]);
|
||||
x7[1] = dct_const_round_shift_64bit(x7[1]);
|
||||
x4[0] = pack_4(x4[0], x4[1]);
|
||||
x5[0] = pack_4(x5[0], x5[1]);
|
||||
x6[0] = pack_4(x6[0], x6[1]);
|
||||
x7[0] = pack_4(x7[0], x7[1]);
|
||||
x8[0] = _mm_add_epi32(s8[0], s10[0]);
|
||||
x9[0] = _mm_add_epi32(s9[0], s11[0]);
|
||||
x10[0] = _mm_sub_epi32(s8[0], s10[0]);
|
||||
x11[0] = _mm_sub_epi32(s9[0], s11[0]);
|
||||
x12[0] = _mm_add_epi64(s12[0], s14[0]);
|
||||
x12[1] = _mm_add_epi64(s12[1], s14[1]);
|
||||
x13[0] = _mm_add_epi64(s13[0], s15[0]);
|
||||
x13[1] = _mm_add_epi64(s13[1], s15[1]);
|
||||
x14[0] = _mm_sub_epi64(s12[0], s14[0]);
|
||||
x14[1] = _mm_sub_epi64(s12[1], s14[1]);
|
||||
x15[0] = _mm_sub_epi64(s13[0], s15[0]);
|
||||
x15[1] = _mm_sub_epi64(s13[1], s15[1]);
|
||||
x12[0] = dct_const_round_shift_64bit(x12[0]);
|
||||
x12[1] = dct_const_round_shift_64bit(x12[1]);
|
||||
x13[0] = dct_const_round_shift_64bit(x13[0]);
|
||||
x13[1] = dct_const_round_shift_64bit(x13[1]);
|
||||
x14[0] = dct_const_round_shift_64bit(x14[0]);
|
||||
x14[1] = dct_const_round_shift_64bit(x14[1]);
|
||||
x15[0] = dct_const_round_shift_64bit(x15[0]);
|
||||
x15[1] = dct_const_round_shift_64bit(x15[1]);
|
||||
x12[0] = pack_4(x12[0], x12[1]);
|
||||
x13[0] = pack_4(x13[0], x13[1]);
|
||||
x14[0] = pack_4(x14[0], x14[1]);
|
||||
x15[0] = pack_4(x15[0], x15[1]);
|
||||
|
||||
// stage 4
|
||||
s2[0] = _mm_add_epi32(x2[0], x3[0]);
|
||||
s3[0] = _mm_sub_epi32(x2[0], x3[0]);
|
||||
s6[0] = _mm_add_epi32(x7[0], x6[0]);
|
||||
s7[0] = _mm_sub_epi32(x7[0], x6[0]);
|
||||
s10[0] = _mm_add_epi32(x11[0], x10[0]);
|
||||
s11[0] = _mm_sub_epi32(x11[0], x10[0]);
|
||||
s14[0] = _mm_add_epi32(x14[0], x15[0]);
|
||||
s15[0] = _mm_sub_epi32(x14[0], x15[0]);
|
||||
highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
|
||||
highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
|
||||
highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
|
||||
highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
|
||||
highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
|
||||
highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
|
||||
highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
|
||||
highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
|
||||
|
||||
x2[0] = dct_const_round_shift_64bit(s2[0]);
|
||||
x2[1] = dct_const_round_shift_64bit(s2[1]);
|
||||
x3[0] = dct_const_round_shift_64bit(s3[0]);
|
||||
x3[1] = dct_const_round_shift_64bit(s3[1]);
|
||||
x6[0] = dct_const_round_shift_64bit(s6[0]);
|
||||
x6[1] = dct_const_round_shift_64bit(s6[1]);
|
||||
x7[0] = dct_const_round_shift_64bit(s7[0]);
|
||||
x7[1] = dct_const_round_shift_64bit(s7[1]);
|
||||
x10[0] = dct_const_round_shift_64bit(s10[0]);
|
||||
x10[1] = dct_const_round_shift_64bit(s10[1]);
|
||||
x11[0] = dct_const_round_shift_64bit(s11[0]);
|
||||
x11[1] = dct_const_round_shift_64bit(s11[1]);
|
||||
x14[0] = dct_const_round_shift_64bit(s14[0]);
|
||||
x14[1] = dct_const_round_shift_64bit(s14[1]);
|
||||
x15[0] = dct_const_round_shift_64bit(s15[0]);
|
||||
x15[1] = dct_const_round_shift_64bit(s15[1]);
|
||||
x2[0] = pack_4(x2[0], x2[1]);
|
||||
x3[0] = pack_4(x3[0], x3[1]);
|
||||
x6[0] = pack_4(x6[0], x6[1]);
|
||||
x7[0] = pack_4(x7[0], x7[1]);
|
||||
x10[0] = pack_4(x10[0], x10[1]);
|
||||
x11[0] = pack_4(x11[0], x11[1]);
|
||||
x14[0] = pack_4(x14[0], x14[1]);
|
||||
x15[0] = pack_4(x15[0], x15[1]);
|
||||
|
||||
io[0] = x0[0];
|
||||
io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
|
||||
io[2] = x12[0];
|
||||
io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
|
||||
io[4] = x6[0];
|
||||
io[5] = x14[0];
|
||||
io[6] = x10[0];
|
||||
io[7] = x2[0];
|
||||
io[8] = x3[0];
|
||||
io[9] = x11[0];
|
||||
io[10] = x15[0];
|
||||
io[11] = x7[0];
|
||||
io[12] = x5[0];
|
||||
io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
|
||||
io[14] = x9[0];
|
||||
io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
|
||||
}
|
||||
|
||||
void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int tx_type, int bd) {
|
||||
int i;
|
||||
__m128i out[16], *in;
|
||||
|
||||
if (bd == 8) {
|
||||
__m128i l[16], r[16];
|
||||
|
||||
in = l;
|
||||
for (i = 0; i < 2; i++) {
|
||||
highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
|
||||
highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
|
||||
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
|
||||
idct16_8col(in, in);
|
||||
} else {
|
||||
vpx_iadst16_8col_sse2(in);
|
||||
}
|
||||
in = r;
|
||||
input += 128;
|
||||
}
|
||||
|
||||
for (i = 0; i < 16; i += 8) {
|
||||
int j;
|
||||
transpose_16bit_8x8(l + i, out);
|
||||
transpose_16bit_8x8(r + i, out + 8);
|
||||
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
|
||||
idct16_8col(out, out);
|
||||
} else {
|
||||
vpx_iadst16_8col_sse2(out);
|
||||
}
|
||||
|
||||
for (j = 0; j < 16; ++j) {
|
||||
highbd_write_buffer_8(dest + j * stride, out[j], bd);
|
||||
}
|
||||
dest += 8;
|
||||
}
|
||||
} else {
|
||||
__m128i all[4][16];
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
in = all[i];
|
||||
highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
|
||||
highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
|
||||
if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
|
||||
vpx_highbd_idct16_4col_sse4_1(in);
|
||||
} else {
|
||||
highbd_iadst16_4col_sse4_1(in);
|
||||
}
|
||||
input += 4 * 16;
|
||||
}
|
||||
|
||||
for (i = 0; i < 16; i += 4) {
|
||||
int j;
|
||||
transpose_32bit_4x4(all[0] + i, out + 0);
|
||||
transpose_32bit_4x4(all[1] + i, out + 4);
|
||||
transpose_32bit_4x4(all[2] + i, out + 8);
|
||||
transpose_32bit_4x4(all[3] + i, out + 12);
|
||||
if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
|
||||
vpx_highbd_idct16_4col_sse4_1(out);
|
||||
} else {
|
||||
highbd_iadst16_4col_sse4_1(out);
|
||||
}
|
||||
|
||||
for (j = 0; j < 16; ++j) {
|
||||
highbd_write_buffer_4(dest + j * stride, out[j], bd);
|
||||
}
|
||||
dest += 4;
|
||||
}
|
||||
}
|
||||
}
|
@ -83,6 +83,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
|
||||
else
|
||||
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c
|
||||
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c
|
||||
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c
|
||||
endif
|
||||
|
||||
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
|
||||
|
@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
|
||||
out[15] = in[15];
|
||||
}
|
||||
|
||||
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
|
||||
void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
|
||||
__m128i step1[16], step2[16];
|
||||
|
||||
// stage 2
|
||||
@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
|
||||
in = all[i];
|
||||
highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
|
||||
highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
|
||||
highbd_idct16_4col(in);
|
||||
vpx_highbd_idct16_4col_sse4_1(in);
|
||||
input += 4 * 16;
|
||||
}
|
||||
|
||||
@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
|
||||
transpose_32bit_4x4(all[1] + i, out + 4);
|
||||
transpose_32bit_4x4(all[2] + i, out + 8);
|
||||
transpose_32bit_4x4(all[3] + i, out + 12);
|
||||
highbd_idct16_4col(out);
|
||||
vpx_highbd_idct16_4col_sse4_1(out);
|
||||
|
||||
for (j = 0; j < 16; ++j) {
|
||||
highbd_write_buffer_4(dest + j * stride, out[j], bd);
|
||||
|
@ -107,5 +107,6 @@ static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
|
||||
}
|
||||
|
||||
void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io);
|
||||
void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/);
|
||||
|
||||
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
|
||||
|
@ -514,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
}
|
||||
}
|
||||
|
||||
static void iadst16_8col(__m128i *const in) {
|
||||
void vpx_iadst16_8col_sse2(__m128i *const in) {
|
||||
// perform 16x16 1-D ADST for 8 columns
|
||||
__m128i s[16], x[16], u[32], v[32];
|
||||
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
|
||||
@ -874,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) {
|
||||
|
||||
void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
|
||||
transpose_16bit_16x16(in0, in1);
|
||||
iadst16_8col(in0);
|
||||
iadst16_8col(in1);
|
||||
vpx_iadst16_8col_sse2(in0);
|
||||
vpx_iadst16_8col_sse2(in1);
|
||||
}
|
||||
|
||||
// Group the coefficient calculation into smaller functions to prevent stack
|
||||
|
@ -701,6 +701,7 @@ void vpx_idct8_sse2(__m128i *const in);
|
||||
void idct16_sse2(__m128i *const in0, __m128i *const in1);
|
||||
void iadst4_sse2(__m128i *const in);
|
||||
void iadst8_sse2(__m128i *const in);
|
||||
void vpx_iadst16_8col_sse2(__m128i *const in);
|
||||
void iadst16_sse2(__m128i *const in0, __m128i *const in1);
|
||||
void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
|
||||
void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);
|
||||
|
Loading…
x
Reference in New Issue
Block a user