Merge "Clean highbd idct x86 code with inline functions"
This commit is contained in:
commit
15193ce51f
@ -18,18 +18,12 @@
|
||||
|
||||
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
|
||||
__m128i *const out) {
|
||||
__m128i temp1[2], temp2, sign[2];
|
||||
// stage 5
|
||||
out[0] = _mm_add_epi32(in[0], in[3]);
|
||||
out[1] = _mm_add_epi32(in[1], in[2]);
|
||||
out[2] = _mm_sub_epi32(in[1], in[2]);
|
||||
out[3] = _mm_sub_epi32(in[0], in[3]);
|
||||
temp2 = _mm_sub_epi32(in[6], in[5]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
out[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2 = _mm_add_epi32(in[6], in[5]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
out[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
|
||||
out[8] = _mm_add_epi32(in[8], in[11]);
|
||||
out[9] = _mm_add_epi32(in[9], in[10]);
|
||||
out[10] = _mm_sub_epi32(in[9], in[10]);
|
||||
@ -42,7 +36,6 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
|
||||
|
||||
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
|
||||
__m128i *const out) {
|
||||
__m128i temp1[2], temp2, sign[2];
|
||||
out[0] = _mm_add_epi32(in[0], in[7]);
|
||||
out[1] = _mm_add_epi32(in[1], in[6]);
|
||||
out[2] = _mm_add_epi32(in[2], in[5]);
|
||||
@ -53,26 +46,14 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
|
||||
out[7] = _mm_sub_epi32(in[0], in[7]);
|
||||
out[8] = in[8];
|
||||
out[9] = in[9];
|
||||
temp2 = _mm_sub_epi32(in[13], in[10]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
out[10] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2 = _mm_add_epi32(in[13], in[10]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
out[13] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
|
||||
temp2 = _mm_sub_epi32(in[12], in[11]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
out[11] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2 = _mm_add_epi32(in[12], in[11]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
out[12] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
|
||||
highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
|
||||
out[14] = in[14];
|
||||
out[15] = in[15];
|
||||
}
|
||||
|
||||
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
|
||||
__m128i step1[16], step2[16];
|
||||
__m128i temp1[4], temp2, sign[2];
|
||||
|
||||
// stage 2
|
||||
highbd_butterfly_sse2(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
|
||||
@ -99,12 +80,7 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
|
||||
step1[15] = _mm_add_epi32(step2[15], step2[14]);
|
||||
|
||||
// stage 4
|
||||
temp2 = _mm_add_epi32(io[0], io[8]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2 = _mm_sub_epi32(io[0], io[8]);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
|
||||
highbd_butterfly_sse2(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
|
||||
&step2[2], &step2[3]);
|
||||
highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
|
||||
|
@ -19,18 +19,12 @@
|
||||
|
||||
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
|
||||
__m128i *const out) {
|
||||
__m128i temp1[2], temp2;
|
||||
// stage 5
|
||||
out[0] = _mm_add_epi32(in[0], in[3]);
|
||||
out[1] = _mm_add_epi32(in[1], in[2]);
|
||||
out[2] = _mm_sub_epi32(in[1], in[2]);
|
||||
out[3] = _mm_sub_epi32(in[0], in[3]);
|
||||
temp2 = _mm_sub_epi32(in[6], in[5]);
|
||||
extend_64bit(temp2, temp1);
|
||||
out[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2 = _mm_add_epi32(in[6], in[5]);
|
||||
extend_64bit(temp2, temp1);
|
||||
out[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
|
||||
out[8] = _mm_add_epi32(in[8], in[11]);
|
||||
out[9] = _mm_add_epi32(in[9], in[10]);
|
||||
out[10] = _mm_sub_epi32(in[9], in[10]);
|
||||
@ -43,7 +37,6 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
|
||||
|
||||
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
|
||||
__m128i *const out) {
|
||||
__m128i temp1[2], temp2;
|
||||
out[0] = _mm_add_epi32(in[0], in[7]);
|
||||
out[1] = _mm_add_epi32(in[1], in[6]);
|
||||
out[2] = _mm_add_epi32(in[2], in[5]);
|
||||
@ -54,26 +47,14 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
|
||||
out[7] = _mm_sub_epi32(in[0], in[7]);
|
||||
out[8] = in[8];
|
||||
out[9] = in[9];
|
||||
temp2 = _mm_sub_epi32(in[13], in[10]);
|
||||
extend_64bit(temp2, temp1);
|
||||
out[10] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2 = _mm_add_epi32(in[13], in[10]);
|
||||
extend_64bit(temp2, temp1);
|
||||
out[13] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
|
||||
temp2 = _mm_sub_epi32(in[12], in[11]);
|
||||
extend_64bit(temp2, temp1);
|
||||
out[11] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2 = _mm_add_epi32(in[12], in[11]);
|
||||
extend_64bit(temp2, temp1);
|
||||
out[12] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
|
||||
highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
|
||||
out[14] = in[14];
|
||||
out[15] = in[15];
|
||||
}
|
||||
|
||||
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
|
||||
__m128i step1[16], step2[16];
|
||||
__m128i temp1[4], temp2;
|
||||
|
||||
// stage 2
|
||||
highbd_butterfly_sse4_1(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
|
||||
@ -92,26 +73,21 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
|
||||
&step1[5], &step1[6]);
|
||||
step1[8] = _mm_add_epi32(step2[8], step2[9]);
|
||||
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
|
||||
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
|
||||
step1[11] = _mm_add_epi32(step2[10], step2[11]);
|
||||
step1[12] = _mm_add_epi32(step2[13], step2[12]);
|
||||
step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
|
||||
step1[10] = _mm_sub_epi32(step2[11], step2[10]);
|
||||
step1[11] = _mm_add_epi32(step2[11], step2[10]);
|
||||
step1[12] = _mm_add_epi32(step2[12], step2[13]);
|
||||
step1[13] = _mm_sub_epi32(step2[12], step2[13]);
|
||||
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
|
||||
step1[15] = _mm_add_epi32(step2[15], step2[14]);
|
||||
|
||||
// stage 4
|
||||
temp2 = _mm_add_epi32(io[0], io[8]);
|
||||
extend_64bit(temp2, temp1);
|
||||
step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2 = _mm_sub_epi32(io[0], io[8]);
|
||||
extend_64bit(temp2, temp1);
|
||||
step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
|
||||
highbd_butterfly_sse4_1(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
|
||||
&step2[2], &step2[3]);
|
||||
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
|
||||
(int)cospi_8_64, &step2[9], &step2[14]);
|
||||
highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
|
||||
(int)cospi_24_64, &step2[13], &step2[10]);
|
||||
highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
|
||||
-(int)cospi_24_64, &step2[13], &step2[10]);
|
||||
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
|
||||
step1[4] = _mm_add_epi32(step1[4], step1[5]);
|
||||
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
|
||||
@ -147,10 +123,10 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
|
||||
&step1[5], &step1[6]);
|
||||
step1[8] = _mm_add_epi32(step2[8], step2[9]);
|
||||
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
|
||||
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
|
||||
step1[11] = _mm_add_epi32(step2[10], step2[11]);
|
||||
step1[12] = _mm_add_epi32(step2[13], step2[12]);
|
||||
step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
|
||||
step1[10] = _mm_sub_epi32(step2[11], step2[10]);
|
||||
step1[11] = _mm_add_epi32(step2[11], step2[10]);
|
||||
step1[12] = _mm_add_epi32(step2[12], step2[13]);
|
||||
step1[13] = _mm_sub_epi32(step2[12], step2[13]);
|
||||
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
|
||||
step1[15] = _mm_add_epi32(step2[15], step2[14]);
|
||||
|
||||
@ -162,8 +138,8 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
|
||||
&step2[2], &step2[3]);
|
||||
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
|
||||
(int)cospi_8_64, &step2[9], &step2[14]);
|
||||
highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
|
||||
(int)cospi_24_64, &step2[13], &step2[10]);
|
||||
highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
|
||||
-(int)cospi_24_64, &step2[13], &step2[10]);
|
||||
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
|
||||
step1[4] = _mm_add_epi32(step1[4], step1[5]);
|
||||
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
|
||||
@ -193,12 +169,10 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
|
||||
&step1[4], &step1[7]);
|
||||
step1[8] = step2[8];
|
||||
step1[9] = step2[8];
|
||||
step1[10] =
|
||||
_mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
|
||||
step1[10] = step2[11];
|
||||
step1[11] = step2[11];
|
||||
step1[12] = step2[12];
|
||||
step1[13] =
|
||||
_mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
|
||||
step1[13] = step2[12];
|
||||
step1[14] = step2[15];
|
||||
step1[15] = step2[15];
|
||||
|
||||
@ -210,8 +184,8 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
|
||||
step2[3] = _mm_setzero_si128();
|
||||
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
|
||||
(int)cospi_8_64, &step2[9], &step2[14]);
|
||||
highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
|
||||
(int)cospi_24_64, &step2[13], &step2[10]);
|
||||
highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
|
||||
-(int)cospi_24_64, &step2[13], &step2[10]);
|
||||
step2[5] = step1[4];
|
||||
step2[6] = step1[7];
|
||||
step2[8] = step1[8];
|
||||
|
@ -75,17 +75,12 @@ static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
|
||||
}
|
||||
|
||||
static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
|
||||
__m128i temp[2], sign[2], step[4];
|
||||
__m128i step[4];
|
||||
|
||||
transpose_32bit_4x4(io, io);
|
||||
|
||||
// stage 1
|
||||
temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
|
||||
abs_extend_64bit_sse2(temp[0], temp, sign);
|
||||
step[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
|
||||
temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
|
||||
abs_extend_64bit_sse2(temp[0], temp, sign);
|
||||
step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
|
||||
highbd_butterfly_sse2(io[1], io[3], (int)cospi_24_64, (int)cospi_8_64,
|
||||
&step[2], &step[3]);
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
|
||||
static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
__m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
|
||||
__m128i step1[8], step2[8];
|
||||
|
||||
transpose_32bit_4x4x2(io, io);
|
||||
|
||||
@ -31,12 +31,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
&step1[5], &step1[6]);
|
||||
|
||||
// stage 2
|
||||
temp2[0] = _mm_add_epi32(step1[0], step1[2]);
|
||||
abs_extend_64bit_sse2(temp2[0], temp1, sign);
|
||||
step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
|
||||
abs_extend_64bit_sse2(temp2[0], temp1, sign);
|
||||
step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
|
||||
highbd_butterfly_sse2(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
|
||||
&step2[2], &step2[3]);
|
||||
step2[4] = _mm_add_epi32(step1[4], step1[5]);
|
||||
@ -50,12 +45,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
step1[2] = _mm_sub_epi32(step2[1], step2[2]);
|
||||
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
|
||||
step1[4] = step2[4];
|
||||
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
|
||||
abs_extend_64bit_sse2(temp2[0], temp1, sign);
|
||||
step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
|
||||
abs_extend_64bit_sse2(temp2[0], temp1, sign);
|
||||
step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
|
||||
step1[7] = step2[7];
|
||||
|
||||
// stage 4
|
||||
@ -63,7 +53,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
}
|
||||
|
||||
static void highbd_idct8x8_12_half1d(__m128i *const io) {
|
||||
__m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
|
||||
__m128i temp1[4], sign[2], step1[8], step2[8];
|
||||
|
||||
transpose_32bit_4x4(io, io);
|
||||
|
||||
@ -94,12 +84,7 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) {
|
||||
step1[2] = _mm_sub_epi32(step2[0], step2[2]);
|
||||
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
|
||||
step1[4] = step2[4];
|
||||
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
|
||||
abs_extend_64bit_sse2(temp2[0], temp1, sign);
|
||||
step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
|
||||
abs_extend_64bit_sse2(temp2[0], temp1, sign);
|
||||
step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
|
||||
step1[7] = step2[7];
|
||||
|
||||
// stage 4
|
||||
|
@ -18,7 +18,7 @@
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
|
||||
static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
__m128i temp1[4], temp2[4], step1[8], step2[8];
|
||||
__m128i step1[8], step2[8];
|
||||
|
||||
transpose_32bit_4x4x2(io, io);
|
||||
|
||||
@ -33,12 +33,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
&step1[5], &step1[6]);
|
||||
|
||||
// stage 2
|
||||
temp2[0] = _mm_add_epi32(step1[0], step1[2]);
|
||||
extend_64bit(temp2[0], temp1);
|
||||
step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
|
||||
extend_64bit(temp2[0], temp1);
|
||||
step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
|
||||
highbd_butterfly_sse4_1(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
|
||||
&step2[2], &step2[3]);
|
||||
step2[4] = _mm_add_epi32(step1[4], step1[5]);
|
||||
@ -52,12 +47,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
step1[2] = _mm_sub_epi32(step2[1], step2[2]);
|
||||
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
|
||||
step1[4] = step2[4];
|
||||
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
|
||||
extend_64bit(temp2[0], temp1);
|
||||
step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
|
||||
extend_64bit(temp2[0], temp1);
|
||||
step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
|
||||
step1[7] = step2[7];
|
||||
|
||||
// stage 4
|
||||
@ -65,7 +55,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
|
||||
}
|
||||
|
||||
static void highbd_idct8x8_12_half1d(__m128i *const io) {
|
||||
__m128i temp1[4], temp2[4], step1[8], step2[8];
|
||||
__m128i temp1[2], step1[8], step2[8];
|
||||
|
||||
transpose_32bit_4x4(io, io);
|
||||
|
||||
@ -96,12 +86,7 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) {
|
||||
step1[2] = _mm_sub_epi32(step2[0], step2[2]);
|
||||
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
|
||||
step1[4] = step2[4];
|
||||
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
|
||||
extend_64bit(temp2[0], temp1);
|
||||
step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
|
||||
extend_64bit(temp2[0], temp1);
|
||||
step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
|
||||
step1[7] = step2[7];
|
||||
|
||||
// stage 4
|
||||
|
@ -140,6 +140,20 @@ static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
|
||||
*out1 = pack_4(temp2[0], temp2[1]);
|
||||
}
|
||||
|
||||
static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
|
||||
const __m128i in1,
|
||||
__m128i *const out0,
|
||||
__m128i *const out1) {
|
||||
__m128i temp1[2], temp2, sign[2];
|
||||
|
||||
temp2 = _mm_add_epi32(in0, in1);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
*out0 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
temp2 = _mm_sub_epi32(in0, in1);
|
||||
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||
*out1 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||
}
|
||||
|
||||
// Note: c0 and c1 must be non-negative.
|
||||
static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0,
|
||||
const int c1, __m128i *const out0,
|
||||
|
@ -59,6 +59,20 @@ static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1,
|
||||
*out1 = pack_4(temp2[0], temp2[1]);
|
||||
}
|
||||
|
||||
static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
|
||||
const __m128i in1,
|
||||
__m128i *const out0,
|
||||
__m128i *const out1) {
|
||||
__m128i temp1[2], temp2;
|
||||
|
||||
temp2 = _mm_add_epi32(in0, in1);
|
||||
extend_64bit(temp2, temp1);
|
||||
*out0 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
temp2 = _mm_sub_epi32(in0, in1);
|
||||
extend_64bit(temp2, temp1);
|
||||
*out1 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
|
||||
}
|
||||
|
||||
static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0,
|
||||
const int c1,
|
||||
__m128i *const out0,
|
||||
|
Loading…
Reference in New Issue
Block a user