Merge "Clean highbd idct x86 code with inline functions"

This commit is contained in:
Linfeng Zhang 2017-08-10 20:25:18 +00:00 committed by Gerrit Code Review
commit 15193ce51f
7 changed files with 64 additions and 121 deletions

View File

@ -18,18 +18,12 @@
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
__m128i *const out) {
__m128i temp1[2], temp2, sign[2];
// stage 5
out[0] = _mm_add_epi32(in[0], in[3]);
out[1] = _mm_add_epi32(in[1], in[2]);
out[2] = _mm_sub_epi32(in[1], in[2]);
out[3] = _mm_sub_epi32(in[0], in[3]);
temp2 = _mm_sub_epi32(in[6], in[5]);
abs_extend_64bit_sse2(temp2, temp1, sign);
out[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2 = _mm_add_epi32(in[6], in[5]);
abs_extend_64bit_sse2(temp2, temp1, sign);
out[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
out[8] = _mm_add_epi32(in[8], in[11]);
out[9] = _mm_add_epi32(in[9], in[10]);
out[10] = _mm_sub_epi32(in[9], in[10]);
@ -42,7 +36,6 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
__m128i *const out) {
__m128i temp1[2], temp2, sign[2];
out[0] = _mm_add_epi32(in[0], in[7]);
out[1] = _mm_add_epi32(in[1], in[6]);
out[2] = _mm_add_epi32(in[2], in[5]);
@ -53,26 +46,14 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
out[7] = _mm_sub_epi32(in[0], in[7]);
out[8] = in[8];
out[9] = in[9];
temp2 = _mm_sub_epi32(in[13], in[10]);
abs_extend_64bit_sse2(temp2, temp1, sign);
out[10] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2 = _mm_add_epi32(in[13], in[10]);
abs_extend_64bit_sse2(temp2, temp1, sign);
out[13] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2 = _mm_sub_epi32(in[12], in[11]);
abs_extend_64bit_sse2(temp2, temp1, sign);
out[11] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2 = _mm_add_epi32(in[12], in[11]);
abs_extend_64bit_sse2(temp2, temp1, sign);
out[12] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
out[14] = in[14];
out[15] = in[15];
}
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
__m128i temp1[4], temp2, sign[2];
// stage 2
highbd_butterfly_sse2(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
@ -99,12 +80,7 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
step1[15] = _mm_add_epi32(step2[15], step2[14]);
// stage 4
temp2 = _mm_add_epi32(io[0], io[8]);
abs_extend_64bit_sse2(temp2, temp1, sign);
step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2 = _mm_sub_epi32(io[0], io[8]);
abs_extend_64bit_sse2(temp2, temp1, sign);
step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
highbd_butterfly_sse2(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,

View File

@ -19,18 +19,12 @@
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
__m128i *const out) {
__m128i temp1[2], temp2;
// stage 5
out[0] = _mm_add_epi32(in[0], in[3]);
out[1] = _mm_add_epi32(in[1], in[2]);
out[2] = _mm_sub_epi32(in[1], in[2]);
out[3] = _mm_sub_epi32(in[0], in[3]);
temp2 = _mm_sub_epi32(in[6], in[5]);
extend_64bit(temp2, temp1);
out[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2 = _mm_add_epi32(in[6], in[5]);
extend_64bit(temp2, temp1);
out[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
out[8] = _mm_add_epi32(in[8], in[11]);
out[9] = _mm_add_epi32(in[9], in[10]);
out[10] = _mm_sub_epi32(in[9], in[10]);
@ -43,7 +37,6 @@ static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
__m128i *const out) {
__m128i temp1[2], temp2;
out[0] = _mm_add_epi32(in[0], in[7]);
out[1] = _mm_add_epi32(in[1], in[6]);
out[2] = _mm_add_epi32(in[2], in[5]);
@ -54,26 +47,14 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
out[7] = _mm_sub_epi32(in[0], in[7]);
out[8] = in[8];
out[9] = in[9];
temp2 = _mm_sub_epi32(in[13], in[10]);
extend_64bit(temp2, temp1);
out[10] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2 = _mm_add_epi32(in[13], in[10]);
extend_64bit(temp2, temp1);
out[13] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2 = _mm_sub_epi32(in[12], in[11]);
extend_64bit(temp2, temp1);
out[11] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2 = _mm_add_epi32(in[12], in[11]);
extend_64bit(temp2, temp1);
out[12] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
out[14] = in[14];
out[15] = in[15];
}
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
__m128i temp1[4], temp2;
// stage 2
highbd_butterfly_sse4_1(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
@ -92,26 +73,21 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
&step1[5], &step1[6]);
step1[8] = _mm_add_epi32(step2[8], step2[9]);
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
step1[11] = _mm_add_epi32(step2[10], step2[11]);
step1[12] = _mm_add_epi32(step2[13], step2[12]);
step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
step1[10] = _mm_sub_epi32(step2[11], step2[10]);
step1[11] = _mm_add_epi32(step2[11], step2[10]);
step1[12] = _mm_add_epi32(step2[12], step2[13]);
step1[13] = _mm_sub_epi32(step2[12], step2[13]);
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
step1[15] = _mm_add_epi32(step2[15], step2[14]);
// stage 4
temp2 = _mm_add_epi32(io[0], io[8]);
extend_64bit(temp2, temp1);
step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2 = _mm_sub_epi32(io[0], io[8]);
extend_64bit(temp2, temp1);
step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
highbd_butterfly_sse4_1(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
(int)cospi_8_64, &step2[9], &step2[14]);
highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
(int)cospi_24_64, &step2[13], &step2[10]);
highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
-(int)cospi_24_64, &step2[13], &step2[10]);
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
step1[4] = _mm_add_epi32(step1[4], step1[5]);
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@ -147,10 +123,10 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
&step1[5], &step1[6]);
step1[8] = _mm_add_epi32(step2[8], step2[9]);
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
step1[11] = _mm_add_epi32(step2[10], step2[11]);
step1[12] = _mm_add_epi32(step2[13], step2[12]);
step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
step1[10] = _mm_sub_epi32(step2[11], step2[10]);
step1[11] = _mm_add_epi32(step2[11], step2[10]);
step1[12] = _mm_add_epi32(step2[12], step2[13]);
step1[13] = _mm_sub_epi32(step2[12], step2[13]);
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
step1[15] = _mm_add_epi32(step2[15], step2[14]);
@ -162,8 +138,8 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
&step2[2], &step2[3]);
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
(int)cospi_8_64, &step2[9], &step2[14]);
highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
(int)cospi_24_64, &step2[13], &step2[10]);
highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
-(int)cospi_24_64, &step2[13], &step2[10]);
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
step1[4] = _mm_add_epi32(step1[4], step1[5]);
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@ -193,12 +169,10 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
&step1[4], &step1[7]);
step1[8] = step2[8];
step1[9] = step2[8];
step1[10] =
_mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
step1[10] = step2[11];
step1[11] = step2[11];
step1[12] = step2[12];
step1[13] =
_mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
step1[13] = step2[12];
step1[14] = step2[15];
step1[15] = step2[15];
@ -210,8 +184,8 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
step2[3] = _mm_setzero_si128();
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
(int)cospi_8_64, &step2[9], &step2[14]);
highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
(int)cospi_24_64, &step2[13], &step2[10]);
highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
-(int)cospi_24_64, &step2[13], &step2[10]);
step2[5] = step1[4];
step2[6] = step1[7];
step2[8] = step1[8];

View File

@ -75,17 +75,12 @@ static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
}
static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
__m128i temp[2], sign[2], step[4];
__m128i step[4];
transpose_32bit_4x4(io, io);
// stage 1
temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
abs_extend_64bit_sse2(temp[0], temp, sign);
step[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
abs_extend_64bit_sse2(temp[0], temp, sign);
step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
highbd_butterfly_sse2(io[1], io[3], (int)cospi_24_64, (int)cospi_8_64,
&step[2], &step[3]);

View File

@ -16,7 +16,7 @@
#include "vpx_dsp/x86/transpose_sse2.h"
static void highbd_idct8x8_half1d(__m128i *const io) {
__m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
__m128i step1[8], step2[8];
transpose_32bit_4x4x2(io, io);
@ -31,12 +31,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
&step1[5], &step1[6]);
// stage 2
temp2[0] = _mm_add_epi32(step1[0], step1[2]);
abs_extend_64bit_sse2(temp2[0], temp1, sign);
step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
abs_extend_64bit_sse2(temp2[0], temp1, sign);
step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
highbd_butterfly_sse2(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
step2[4] = _mm_add_epi32(step1[4], step1[5]);
@ -50,12 +45,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
step1[2] = _mm_sub_epi32(step2[1], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
abs_extend_64bit_sse2(temp2[0], temp1, sign);
step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
abs_extend_64bit_sse2(temp2[0], temp1, sign);
step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4
@ -63,7 +53,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
}
static void highbd_idct8x8_12_half1d(__m128i *const io) {
__m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
__m128i temp1[4], sign[2], step1[8], step2[8];
transpose_32bit_4x4(io, io);
@ -94,12 +84,7 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) {
step1[2] = _mm_sub_epi32(step2[0], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
abs_extend_64bit_sse2(temp2[0], temp1, sign);
step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
abs_extend_64bit_sse2(temp2[0], temp1, sign);
step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4

View File

@ -18,7 +18,7 @@
#include "vpx_dsp/x86/transpose_sse2.h"
static void highbd_idct8x8_half1d(__m128i *const io) {
__m128i temp1[4], temp2[4], step1[8], step2[8];
__m128i step1[8], step2[8];
transpose_32bit_4x4x2(io, io);
@ -33,12 +33,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
&step1[5], &step1[6]);
// stage 2
temp2[0] = _mm_add_epi32(step1[0], step1[2]);
extend_64bit(temp2[0], temp1);
step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
extend_64bit(temp2[0], temp1);
step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
highbd_butterfly_sse4_1(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
step2[4] = _mm_add_epi32(step1[4], step1[5]);
@ -52,12 +47,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
step1[2] = _mm_sub_epi32(step2[1], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
extend_64bit(temp2[0], temp1);
step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
extend_64bit(temp2[0], temp1);
step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4
@ -65,7 +55,7 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
}
static void highbd_idct8x8_12_half1d(__m128i *const io) {
__m128i temp1[4], temp2[4], step1[8], step2[8];
__m128i temp1[2], step1[8], step2[8];
transpose_32bit_4x4(io, io);
@ -96,12 +86,7 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) {
step1[2] = _mm_sub_epi32(step2[0], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
extend_64bit(temp2[0], temp1);
step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
temp2[0] = _mm_add_epi32(step2[6], step2[5]);
extend_64bit(temp2[0], temp1);
step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4

View File

@ -140,6 +140,20 @@ static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
*out1 = pack_4(temp2[0], temp2[1]);
}
// Butterfly for the case where the only constant involved is cospi_16_64.
//
// The 32-bit lane-wise sum and difference of the two inputs are each run
// through abs_extend_64bit_sse2() followed by
// multiplication_round_shift_sse2() with cospi_16_64 (both helpers are
// defined elsewhere in this header set).
//
//   *out0 receives the result computed from (in0 + in1).
//   *out1 receives the result computed from (in0 - in1).
static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
                                                 const __m128i in1,
                                                 __m128i *const out0,
                                                 __m128i *const out1) {
  const __m128i sum = _mm_add_epi32(in0, in1);
  const __m128i diff = _mm_sub_epi32(in0, in1);
  __m128i ext[2], sgn[2];  // scratch outputs of abs_extend_64bit_sse2()

  // Sum path -> out0.
  abs_extend_64bit_sse2(sum, ext, sgn);
  *out0 = multiplication_round_shift_sse2(ext, sgn, (int)cospi_16_64);

  // Difference path -> out1 (scratch arrays are reused).
  abs_extend_64bit_sse2(diff, ext, sgn);
  *out1 = multiplication_round_shift_sse2(ext, sgn, (int)cospi_16_64);
}
// Note: c0 and c1 must be non negative.
static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0,
const int c1, __m128i *const out0,

View File

@ -59,6 +59,20 @@ static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1,
*out1 = pack_4(temp2[0], temp2[1]);
}
// Butterfly for the case where the only constant involved is cospi_16_64
// (SSE4.1 flavour).
//
// The 32-bit lane-wise sum and difference of the two inputs are each run
// through extend_64bit() followed by multiplication_round_shift_sse4_1()
// with cospi_16_64 (both helpers are defined elsewhere in this header set).
//
//   *out0 receives the result computed from (in0 + in1).
//   *out1 receives the result computed from (in0 - in1).
static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
                                                   const __m128i in1,
                                                   __m128i *const out0,
                                                   __m128i *const out1) {
  const __m128i sum = _mm_add_epi32(in0, in1);
  const __m128i diff = _mm_sub_epi32(in0, in1);
  __m128i ext[2];  // scratch output of extend_64bit()

  // Sum path -> out0.
  extend_64bit(sum, ext);
  *out0 = multiplication_round_shift_sse4_1(ext, (int)cospi_16_64);

  // Difference path -> out1 (scratch array is reused).
  extend_64bit(diff, ext);
  *out1 = multiplication_round_shift_sse4_1(ext, (int)cospi_16_64);
}
static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0,
const int c1,
__m128i *const out0,