Replace idct8x8_12_add_ssse3 assembly code with intrinsics

- Performance matches that of the assembly version.
- Unit tests pass.

Change-Id: I6eacfbbd826b3946c724d78fbef7948af6406ccd
This commit is contained in:
Yi Luo 2017-02-07 16:40:10 -08:00
parent 0fefc6873a
commit ac04d11abc
5 changed files with 167 additions and 141 deletions

View File

@ -779,7 +779,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
specialize qw/vpx_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;

View File

@ -263,32 +263,6 @@ void iadst4_sse2(__m128i *in) {
in[1] = _mm_packs_epi32(u[2], u[3]);
}
// Transposes four packed row registers (tmp0..tmp3) into out0..out3.
// Pass 1 interleaves 16-bit lanes — note the deliberately swapped operand
// order in tr0_1/tr0_5 relative to tr0_0/tr0_4, which reverses the lane
// pairing compared to a plain transpose; pass 2 interleaves 32-bit pairs;
// pass 3 recombines 64-bit halves. Used by the 12-coefficient idct8x8
// path, whose row pass keeps pairs of stage outputs packed per register.
// NOTE(review): the exact lane layout is tied to that caller's packing —
// verify against vpx_idct8x8_12_add_ssse3 before reusing elsewhere.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
{ \
const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
\
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
\
out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
// Transposes the low four 16-bit lanes of in0..in3 (a 4x4 sub-block),
// producing two transposed rows per output register: pass 1 interleaves
// 16-bit lanes, pass 2 interleaves 32-bit pairs. The caller labels this
// the "8x4 Transpose" for the 12-coefficient idct8x8 path, where only
// the low halves of the four loaded rows carry data.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
}
// Define Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
res0, res1, res2, res3) \

View File

@ -216,6 +216,32 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE(dest + 15 * stride, in[15]);
}
// Transposes four packed row registers (tmp0..tmp3) into out0..out3.
// Pass 1 interleaves 16-bit lanes — note the deliberately swapped operand
// order in tr0_1/tr0_5 relative to tr0_0/tr0_4, which reverses the lane
// pairing compared to a plain transpose; pass 2 interleaves 32-bit pairs;
// pass 3 recombines 64-bit halves. Used by the 12-coefficient idct8x8
// path, whose row pass keeps pairs of stage outputs packed per register.
// NOTE(review): the exact lane layout is tied to that caller's packing —
// verify against vpx_idct8x8_12_add_ssse3 before reusing elsewhere.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
{ \
const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
\
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
\
out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
// Transposes the low four 16-bit lanes of in0..in3 (a 4x4 sub-block),
// producing two transposed rows per output register: pass 1 interleaves
// 16-bit lanes, pass 2 interleaves 32-bit pairs. The caller labels this
// the "8x4 Transpose" for the 12-coefficient idct8x8 path, where only
// the low halves of the four loaded rows carry data.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
}
void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1);

View File

@ -182,3 +182,143 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
RECON_AND_STORE(dest + 6 * stride, in6);
RECON_AND_STORE(dest + 7 * stride, in7);
}
// Inverse 8x8 2D-DCT plus reconstruction add for blocks where only the
// first 12 coefficients are non-zero: just input rows 0-3 are loaded and
// the row pass operates on their low (left) halves. Intrinsics port of
// the former idct8x8_12_add SSSE3 assembly.
//
// _mm_mulhrs_epi16 (pmulhrsw) computes round((a * b) >> 15), so
// multiplying by a pre-doubled constant (2 * cospi_N_64) yields
// round(a * cospi_N_64 >> 14) — the usual idct rounding shift.
//
// Writes the reconstructed 8x8 block to dest (stride-separated rows);
// input/dest/stride contract matches the other idct8x8 variants.
void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128();
// Final rounding: add 16 before the arithmetic shift right by 5 below.
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// Pre-doubled stage-1/stage-2 cosine constants for pmulhrsw (see above).
const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
__m128i tmp0, tmp1, tmp2, tmp3;
// Rows. Load 4-row input data.
in0 = load_input_data(input);
in1 = load_input_data(input + 8 * 1);
in2 = load_input_data(input + 8 * 2);
in3 = load_input_data(input + 8 * 3);
// 8x4 Transpose
TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
// Row pass: the transposed rows are packed two-per-register, so each
// stage below keeps pairs of butterfly outputs in 64-bit half-registers
// and recombines them with unpacklo/hi_epi64.
// Stage1
tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
// Stage2
tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
tmp0 = _mm_add_epi16(stp1_4, stp1_5);
tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
stp2_4 = tmp0;
// stp2_5/stp2_6 live in the low/high halves of tmp1 respectively.
stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
// Stage3
tmp2 = _mm_add_epi16(stp2_0, stp2_2);
tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
// (stp2_6 -/+ stp2_5) * cospi_16_64 for the odd half.
tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
tmp1 = _mm_add_epi16(stp2_6, stp2_5);
tmp2 = _mm_mulhrs_epi16(tmp0, stg2_0);
tmp3 = _mm_mulhrs_epi16(tmp1, stg2_0);
stp1_5 = _mm_unpacklo_epi64(tmp2, tmp3);
// Stage4
tmp0 = _mm_add_epi16(stp1_3, stp2_4);
tmp1 = _mm_add_epi16(stp1_2, stp1_5);
tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
// Transpose back to column order; after this the column pass works on
// full 8-lane registers in0..in3 (rows 4-7 of the row pass are zero).
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
// Columns: standard idct8 butterfly stages on the four non-zero rows.
/* Stage1 */
stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
/* Stage2 */
// NOTE(review): stp2_0 and stp2_1 are identical products here; two
// names are kept to mirror the generic idct8 stage structure.
stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
/* Stage3 */
stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
tmp0 = _mm_add_epi16(stp2_6, stp2_5);
tmp1 = _mm_sub_epi16(stp2_6, stp2_5);
stp1_6 = _mm_mulhrs_epi16(tmp0, stg2_0);
stp1_5 = _mm_mulhrs_epi16(tmp1, stg2_0);
/* Stage4 */
// Symmetric add/sub pairs produce output rows 0-7.
in0 = _mm_add_epi16(stp1_0, stp2_7);
in1 = _mm_add_epi16(stp1_1, stp1_6);
in2 = _mm_add_epi16(stp1_2, stp1_5);
in3 = _mm_add_epi16(stp1_3, stp2_4);
in4 = _mm_sub_epi16(stp1_3, stp2_4);
in5 = _mm_sub_epi16(stp1_2, stp1_5);
in6 = _mm_sub_epi16(stp1_1, stp1_6);
in7 = _mm_sub_epi16(stp1_0, stp2_7);
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
in1 = _mm_adds_epi16(in1, final_rounding);
in2 = _mm_adds_epi16(in2, final_rounding);
in3 = _mm_adds_epi16(in3, final_rounding);
in4 = _mm_adds_epi16(in4, final_rounding);
in5 = _mm_adds_epi16(in5, final_rounding);
in6 = _mm_adds_epi16(in6, final_rounding);
in7 = _mm_adds_epi16(in7, final_rounding);
in0 = _mm_srai_epi16(in0, 5);
in1 = _mm_srai_epi16(in1, 5);
in2 = _mm_srai_epi16(in2, 5);
in3 = _mm_srai_epi16(in3, 5);
in4 = _mm_srai_epi16(in4, 5);
in5 = _mm_srai_epi16(in5, 5);
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
// Add residual to prediction and store each row (RECON_AND_STORE
// macro defined elsewhere in this file).
RECON_AND_STORE(dest + 0 * stride, in0);
RECON_AND_STORE(dest + 1 * stride, in1);
RECON_AND_STORE(dest + 2 * stride, in2);
RECON_AND_STORE(dest + 3 * stride, in3);
RECON_AND_STORE(dest + 4 * stride, in4);
RECON_AND_STORE(dest + 5 * stride, in5);
RECON_AND_STORE(dest + 6 * stride, in6);
RECON_AND_STORE(dest + 7 * stride, in7);
}

View File

@ -222,120 +222,6 @@ SECTION .text
movh [outputq + strideq], m%4
%endmacro
INIT_XMM ssse3
; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
; (this is the assembly version being replaced by the intrinsics port
; above; kept here as the reference implementation)
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
; constants: m8 = pd_8192 rounding, m11 = pw_16 final rounding,
; m12 = pw_11585x2 — presumably 2 * cospi_16_64 for pmulhrsw; verify
; against the dpw_* tables
mova m8, [pd_8192]
mova m11, [pw_16]
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
; only input rows 0-3 can be non-zero, so load just those
LOAD_TRAN_LOW 0, inputq, 0
LOAD_TRAN_LOW 1, inputq, 1
LOAD_TRAN_LOW 2, inputq, 2
LOAD_TRAN_LOW 3, inputq, 3
; transpose the low halves of the four rows
punpcklwd m0, m1
punpcklwd m2, m3
punpckhdq m9, m0, m2
punpckldq m0, m2
SWAP 2, 9
; m0 -> [0], [0]
; m1 -> [1], [1]
; m2 -> [2], [2]
; m3 -> [3], [3]
; duplicate each transposed row across both 64-bit halves
punpckhqdq m10, m0, m0
punpcklqdq m0, m0
punpckhqdq m9, m2, m2
punpcklqdq m2, m2
SWAP 1, 10
SWAP 3, 9
; row pass: pmulhrsw with pre-doubled paired cosine constants, then
; butterfly add/sub stages (SUM_SUB defined elsewhere in this file)
pmulhrsw m0, m12
pmulhrsw m2, [dpw_30274_12540]
pmulhrsw m1, [dpw_6392_32138]
pmulhrsw m3, [dpw_m18204_27246]
SUM_SUB 0, 2, 9
SUM_SUB 1, 3, 9
punpcklqdq m9, m3, m3
punpckhqdq m5, m3, m9
SUM_SUB 3, 5, 9
punpckhqdq m5, m3
pmulhrsw m5, m12
punpckhqdq m9, m1, m5
punpcklqdq m1, m5
SWAP 5, 9
SUM_SUB 0, 5, 9
SUM_SUB 2, 1, 9
; transpose back to column order before the column pass
punpckhqdq m3, m0, m0
punpckhqdq m4, m1, m1
punpckhqdq m6, m5, m5
punpckhqdq m7, m2, m2
punpcklwd m0, m3
punpcklwd m7, m2
punpcklwd m1, m4
punpcklwd m6, m5
punpckhdq m4, m0, m7
punpckldq m0, m7
punpckhdq m10, m1, m6
punpckldq m5, m1, m6
punpckhqdq m1, m0, m5
punpcklqdq m0, m5
punpckhqdq m3, m4, m10
punpcklqdq m2, m4, m10
; column pass: full-width idct8 stages on the four non-zero rows
pmulhrsw m0, m12
pmulhrsw m6, m2, [dpw_30274_30274]
pmulhrsw m4, m2, [dpw_12540_12540]
pmulhrsw m7, m1, [dpw_32138_32138]
pmulhrsw m1, [dpw_6392_6392]
pmulhrsw m5, m3, [dpw_m18204_m18204]
pmulhrsw m3, [dpw_27246_27246]
mova m2, m0
SUM_SUB 0, 6, 9
SUM_SUB 2, 4, 9
SUM_SUB 1, 5, 9
SUM_SUB 7, 3, 9
SUM_SUB 3, 5, 9
pmulhrsw m3, m12
pmulhrsw m5, m12
SUM_SUB 0, 7, 9
SUM_SUB 2, 3, 9
SUM_SUB 4, 5, 9
SUM_SUB 6, 1, 9
SWAP 3, 6
SWAP 1, 2
SWAP 2, 4
; reconstruct: add prediction and store two rows per ADD_STORE_8P_2X
; call (macro defined above; m12 reused as the zero register here)
pxor m12, m12
ADD_STORE_8P_2X 0, 1, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 2, 3, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 4, 5, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
%define idx0 16 * 0
%define idx1 16 * 1
%define idx2 16 * 2