From 39da7fb786eb05661db3ec866660772ae1583f2b Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Tue, 8 Aug 2017 17:39:04 -0700 Subject: [PATCH] Clean highbd idct x86 code with inline functions Created inline functions highbd_butterfly_cospi16_sse2() and highbd_butterfly_cospi16_sse4_1() BUG=webm:1412 Change-Id: Icbc53a73712b6207379872a5e88d0a4d09e2322a --- vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 32 ++---------- vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 66 ++++++++----------------- vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 9 +--- vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 25 ++-------- vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 25 ++-------- vpx_dsp/x86/highbd_inv_txfm_sse2.h | 14 ++++++ vpx_dsp/x86/highbd_inv_txfm_sse4.h | 14 ++++++ 7 files changed, 64 insertions(+), 121 deletions(-) diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index 8192f09ef..280072d0b 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -18,18 +18,12 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, __m128i *const out) { - __m128i temp1[2], temp2, sign[2]; // stage 5 out[0] = _mm_add_epi32(in[0], in[3]); out[1] = _mm_add_epi32(in[1], in[2]); out[2] = _mm_sub_epi32(in[1], in[2]); out[3] = _mm_sub_epi32(in[0], in[3]); - temp2 = _mm_sub_epi32(in[6], in[5]); - abs_extend_64bit_sse2(temp2, temp1, sign); - out[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - temp2 = _mm_add_epi32(in[6], in[5]); - abs_extend_64bit_sse2(temp2, temp1, sign); - out[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); + highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]); out[8] = _mm_add_epi32(in[8], in[11]); out[9] = _mm_add_epi32(in[9], in[10]); out[10] = _mm_sub_epi32(in[9], in[10]); @@ -42,7 +36,6 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, __m128i *const out) { - __m128i temp1[2], temp2, sign[2]; out[0] = _mm_add_epi32(in[0], in[7]); out[1] = _mm_add_epi32(in[1], in[6]); out[2] = _mm_add_epi32(in[2], in[5]); @@ -53,26 +46,14 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, out[7] = _mm_sub_epi32(in[0], in[7]); out[8] = in[8]; out[9] = in[9]; - temp2 = _mm_sub_epi32(in[13], in[10]); - abs_extend_64bit_sse2(temp2, temp1, sign); - out[10] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - temp2 = _mm_add_epi32(in[13], in[10]); - abs_extend_64bit_sse2(temp2, temp1, sign); - out[13] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - - temp2 = _mm_sub_epi32(in[12], in[11]); - abs_extend_64bit_sse2(temp2, temp1, sign); - out[11] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - temp2 = _mm_add_epi32(in[12], in[11]); - abs_extend_64bit_sse2(temp2, temp1, sign); - out[12] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); + highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]); out[14] = in[14]; out[15] = in[15]; } static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { __m128i step1[16], step2[16]; - __m128i temp1[4], temp2, sign[2]; // stage 2 highbd_butterfly_sse2(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64, @@ -99,12 +80,7 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { step1[15] = _mm_add_epi32(step2[15], step2[14]); // stage 4 - temp2 = _mm_add_epi32(io[0], io[8]); - abs_extend_64bit_sse2(temp2, temp1, sign); - step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - temp2 = _mm_sub_epi32(io[0], io[8]); - abs_extend_64bit_sse2(temp2, temp1, sign); - step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); + highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]); highbd_butterfly_sse2(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64, &step2[2], &step2[3]); highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64, diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c index 8cc0d0836..090ed5f94 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -19,18 +19,12 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, __m128i *const out) { - __m128i temp1[2], temp2; // stage 5 out[0] = _mm_add_epi32(in[0], in[3]); out[1] = _mm_add_epi32(in[1], in[2]); out[2] = _mm_sub_epi32(in[1], in[2]); out[3] = _mm_sub_epi32(in[0], in[3]); - temp2 = _mm_sub_epi32(in[6], in[5]); - extend_64bit(temp2, temp1); - out[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - temp2 = _mm_add_epi32(in[6], in[5]); - extend_64bit(temp2, temp1); - out[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]); out[8] = _mm_add_epi32(in[8], in[11]); out[9] = _mm_add_epi32(in[9], in[10]); out[10] = _mm_sub_epi32(in[9], in[10]); @@ -43,7 +37,6 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, __m128i *const out) { - __m128i temp1[2], temp2; out[0] = _mm_add_epi32(in[0], in[7]); out[1] = _mm_add_epi32(in[1], in[6]); out[2] = _mm_add_epi32(in[2], in[5]); @@ -54,26 +47,14 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, out[7] = _mm_sub_epi32(in[0], in[7]); out[8] = in[8]; out[9] = in[9]; - temp2 = _mm_sub_epi32(in[13], in[10]); - extend_64bit(temp2, temp1); - out[10] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - temp2 = _mm_add_epi32(in[13], in[10]); - extend_64bit(temp2, temp1); - out[13] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - - temp2 = _mm_sub_epi32(in[12], in[11]); - extend_64bit(temp2, temp1); - out[11] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - temp2 = _mm_add_epi32(in[12], in[11]); - extend_64bit(temp2, temp1); - out[12] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]); out[14] = in[14]; out[15] = in[15]; } static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { __m128i step1[16], step2[16]; - __m128i temp1[4], temp2; // stage 2 highbd_butterfly_sse4_1(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64, @@ -92,26 +73,21 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { &step1[5], &step1[6]); step1[8] = _mm_add_epi32(step2[8], step2[9]); step1[9] = _mm_sub_epi32(step2[8], step2[9]); - step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] - step1[11] = _mm_add_epi32(step2[10], step2[11]); - step1[12] = _mm_add_epi32(step2[13], step2[12]); - step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); step1[14] = _mm_sub_epi32(step2[15], step2[14]); step1[15] = _mm_add_epi32(step2[15], step2[14]); // stage 4 - temp2 = _mm_add_epi32(io[0], io[8]); - extend_64bit(temp2, temp1); - step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - temp2 = _mm_sub_epi32(io[0], io[8]); - extend_64bit(temp2, temp1); - step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]); highbd_butterfly_sse4_1(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64, &step2[2], &step2[3]); highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64, &step2[9], &step2[14]); - highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64, - (int)cospi_24_64, &step2[13], &step2[10]); + highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64, + -(int)cospi_24_64, &step2[13], &step2[10]); step2[5] = _mm_sub_epi32(step1[4], step1[5]); step1[4] = _mm_add_epi32(step1[4], step1[5]); step2[6] = _mm_sub_epi32(step1[7], step1[6]); @@ -147,10 +123,10 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { &step1[5], &step1[6]); step1[8] = _mm_add_epi32(step2[8], step2[9]); step1[9] = _mm_sub_epi32(step2[8], step2[9]); - step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] - step1[11] = _mm_add_epi32(step2[10], step2[11]); - step1[12] = _mm_add_epi32(step2[13], step2[12]); - step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); step1[14] = _mm_sub_epi32(step2[15], step2[14]); step1[15] = _mm_add_epi32(step2[15], step2[14]); @@ -162,8 +138,8 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { &step2[2], &step2[3]); highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64, &step2[9], &step2[14]); - highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64, - (int)cospi_24_64, &step2[13], &step2[10]); + highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64, + -(int)cospi_24_64, &step2[13], &step2[10]); step2[5] = _mm_sub_epi32(step1[4], step1[5]); step1[4] = _mm_add_epi32(step1[4], step1[5]); step2[6] = _mm_sub_epi32(step1[7], step1[6]); @@ -193,12 +169,10 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { &step1[4], &step1[7]); step1[8] = step2[8]; step1[9] = step2[8]; - step1[10] = - _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] + step1[10] = step2[11]; step1[11] = step2[11]; step1[12] = step2[12]; - step1[13] = - _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] + step1[13] = step2[12]; step1[14] = step2[15]; step1[15] = step2[15]; @@ -210,8 +184,8 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { step2[3] = _mm_setzero_si128(); highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64, &step2[9], &step2[14]); - highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64, - (int)cospi_24_64, &step2[13], &step2[10]); + highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64, + -(int)cospi_24_64, &step2[13], &step2[10]); step2[5] = step1[4]; step2[6] = step1[7]; step2[8] = step1[8]; diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c index 88cc40ac5..417c85a04 100644 --- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -75,17 +75,12 @@ static INLINE void highbd_idct4_small_sse2(__m128i *const io) { } static INLINE void highbd_idct4_large_sse2(__m128i *const io) { - __m128i temp[2], sign[2], step[4]; + __m128i step[4]; transpose_32bit_4x4(io, io); // stage 1 - temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] - abs_extend_64bit_sse2(temp[0], temp, sign); - step[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64); - temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] - abs_extend_64bit_sse2(temp[0], temp, sign); - step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64); + highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]); highbd_butterfly_sse2(io[1], io[3], (int)cospi_24_64, (int)cospi_8_64, &step[2], &step[3]); diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index ac76f5c0e..ff7334199 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -16,7 +16,7 @@ #include "vpx_dsp/x86/transpose_sse2.h" static void highbd_idct8x8_half1d(__m128i *const io) { - __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8]; + __m128i step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -31,12 +31,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) { &step1[5], &step1[6]); // stage 2 - temp2[0] = _mm_add_epi32(step1[0], step1[2]); - abs_extend_64bit_sse2(temp2[0], temp1, sign); - step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - temp2[0] = _mm_sub_epi32(step1[0], step1[2]); - abs_extend_64bit_sse2(temp2[0], temp1, sign); - step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); + highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]); highbd_butterfly_sse2(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64, &step2[2], &step2[3]); step2[4] = _mm_add_epi32(step1[4], step1[5]); @@ -50,12 +45,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) { step1[2] = _mm_sub_epi32(step2[1], step2[2]); step1[3] = _mm_sub_epi32(step2[0], step2[3]); step1[4] = step2[4]; - temp2[0] = _mm_sub_epi32(step2[6], step2[5]); - abs_extend_64bit_sse2(temp2[0], temp1, sign); - step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - temp2[0] = _mm_add_epi32(step2[6], step2[5]); - abs_extend_64bit_sse2(temp2[0], temp1, sign); - step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); step1[7] = step2[7]; // stage 4 @@ -63,7 +53,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) { } static void highbd_idct8x8_12_half1d(__m128i *const io) { - __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8]; + __m128i temp1[4], sign[2], step1[8], step2[8]; transpose_32bit_4x4(io, io); @@ -94,12 +84,7 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) { step1[2] = _mm_sub_epi32(step2[0], step2[2]); step1[3] = _mm_sub_epi32(step2[0], step2[3]); step1[4] = step2[4]; - temp2[0] = _mm_sub_epi32(step2[6], step2[5]); - abs_extend_64bit_sse2(temp2[0], temp1, sign); - step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); - temp2[0] = _mm_add_epi32(step2[6], step2[5]); - abs_extend_64bit_sse2(temp2[0], temp1, sign); - step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); step1[7] = step2[7]; // stage 4 diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c index e901760cc..1e9bde0e9 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -18,7 +18,7 @@ #include "vpx_dsp/x86/transpose_sse2.h" static void highbd_idct8x8_half1d(__m128i *const io) { - __m128i temp1[4], temp2[4], step1[8], step2[8]; + __m128i step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -33,12 +33,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) { &step1[5], &step1[6]); // stage 2 - temp2[0] = _mm_add_epi32(step1[0], step1[2]); - extend_64bit(temp2[0], temp1); - step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - temp2[0] = _mm_sub_epi32(step1[0], step1[2]); - extend_64bit(temp2[0], temp1); - step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]); highbd_butterfly_sse4_1(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64, &step2[2], &step2[3]); step2[4] = _mm_add_epi32(step1[4], step1[5]); @@ -52,12 +47,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) { step1[2] = _mm_sub_epi32(step2[1], step2[2]); step1[3] = _mm_sub_epi32(step2[0], step2[3]); step1[4] = step2[4]; - temp2[0] = _mm_sub_epi32(step2[6], step2[5]); - extend_64bit(temp2[0], temp1); - step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - temp2[0] = _mm_add_epi32(step2[6], step2[5]); - extend_64bit(temp2[0], temp1); - step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); step1[7] = step2[7]; // stage 4 @@ -65,7 +55,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) { } static void highbd_idct8x8_12_half1d(__m128i *const io) { - __m128i temp1[4], temp2[4], step1[8], step2[8]; + __m128i temp1[2], step1[8], step2[8]; transpose_32bit_4x4(io, io); @@ -96,12 +86,7 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) { step1[2] = _mm_sub_epi32(step2[0], step2[2]); step1[3] = _mm_sub_epi32(step2[0], step2[3]); step1[4] = step2[4]; - temp2[0] = _mm_sub_epi32(step2[6], step2[5]); - extend_64bit(temp2[0], temp1); - step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); - temp2[0] = _mm_add_epi32(step2[6], step2[5]); - extend_64bit(temp2[0], temp1); - step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); step1[7] = step2[7]; // stage 4 diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index d7f7a165e..81187e0fd 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -140,6 +140,20 @@ static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, *out1 = pack_4(temp2[0], temp2[1]); } +static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2, sign[2]; + + temp2 = _mm_add_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out0 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out1 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); +} + // Note: c0 and c1 must be non negative. static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0, const int c1, __m128i *const out0, diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 24b3f1046..66af7c699 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -59,6 +59,20 @@ static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, *out1 = pack_4(temp2[0], temp2[1]); } +static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2; + + temp2 = _mm_add_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out0 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out1 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); +} + static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0, const int c1, __m128i *const out0,