Merge "Update load_input_data() in x86"

This commit is contained in:
Linfeng Zhang 2017-06-26 21:48:49 +00:00 committed by Gerrit Code Review
commit 39972d999d
4 changed files with 91 additions and 92 deletions

View File

@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
__m128i in[2];
const __m128i eight = _mm_set1_epi16(8);
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8);
in[0] = load_input_data8(input);
in[1] = load_input_data8(input + 8);
switch (tx_type) {
case 0: // DCT_DCT
@ -57,14 +57,14 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8 * 1);
in[2] = load_input_data(input + 8 * 2);
in[3] = load_input_data(input + 8 * 3);
in[4] = load_input_data(input + 8 * 4);
in[5] = load_input_data(input + 8 * 5);
in[6] = load_input_data(input + 8 * 6);
in[7] = load_input_data(input + 8 * 7);
in[0] = load_input_data8(input);
in[1] = load_input_data8(input + 8 * 1);
in[2] = load_input_data8(input + 8 * 2);
in[3] = load_input_data8(input + 8 * 3);
in[4] = load_input_data8(input + 8 * 4);
in[5] = load_input_data8(input + 8 * 5);
in[6] = load_input_data8(input + 8 * 6);
in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) {
case 0: // DCT_DCT

View File

@ -27,8 +27,8 @@ void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
__m128i in[2];
// Rows
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8);
in[0] = load_input_data8(input);
in[1] = load_input_data8(input + 8);
idct4_sse2(in);
// Columns
@ -491,10 +491,10 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
__m128i in[8], step1[8], step2[8], tmp[4];
in[0] = load_input_data(input + 0 * 8);
in[1] = load_input_data(input + 1 * 8);
in[2] = load_input_data(input + 2 * 8);
in[3] = load_input_data(input + 3 * 8);
in[0] = load_input_data4(input + 0 * 8);
in[1] = load_input_data4(input + 1 * 8);
in[2] = load_input_data4(input + 2 * 8);
in[3] = load_input_data4(input + 3 * 8);
transpose_16bit_4x4(in, in);
// in[0]: 00 10 20 30 01 11 21 31
@ -721,14 +721,14 @@ static INLINE void idct16_8col(__m128i *const in) {
// Fills in[0]..in[7] with eight vectors read from `input` at element offsets
// 0, 16, 32, ..., 112 (i.e. every 8 * 2 elements).
// NOTE(review): this listing is a rendered diff with no +/- markers. The
// load_input_data() lines appear to be the removed side and the
// load_input_data8() lines the added side — confirm against the commit.
static INLINE void idct16_load8x8(const tran_low_t *const input,
__m128i *const in) {
// Loads through load_input_data() (presumably the pre-change lines).
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8 * 2);
in[2] = load_input_data(input + 8 * 4);
in[3] = load_input_data(input + 8 * 6);
in[4] = load_input_data(input + 8 * 8);
in[5] = load_input_data(input + 8 * 10);
in[6] = load_input_data(input + 8 * 12);
in[7] = load_input_data(input + 8 * 14);
// Same destinations and offsets, loaded through load_input_data8()
// (presumably the post-change lines).
in[0] = load_input_data8(input);
in[1] = load_input_data8(input + 8 * 2);
in[2] = load_input_data8(input + 8 * 4);
in[3] = load_input_data8(input + 8 * 6);
in[4] = load_input_data8(input + 8 * 8);
in[5] = load_input_data8(input + 8 * 10);
in[6] = load_input_data8(input + 8 * 12);
in[7] = load_input_data8(input + 8 * 14);
}
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
@ -1258,10 +1258,10 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
int i;
// First 1-D inverse DCT
// Load input data.
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8 * 2);
in[2] = load_input_data(input + 8 * 4);
in[3] = load_input_data(input + 8 * 6);
in[0] = load_input_data4(input + 0 * 16);
in[1] = load_input_data4(input + 1 * 16);
in[2] = load_input_data4(input + 2 * 16);
in[3] = load_input_data4(input + 3 * 16);
transpose_16bit_4x4(in, in);
@ -1651,14 +1651,14 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
int i;
// Load input data. Only need to load the top left 8x8 block.
in[0] = load_input_data(input);
in[1] = load_input_data(input + 32);
in[2] = load_input_data(input + 64);
in[3] = load_input_data(input + 96);
in[4] = load_input_data(input + 128);
in[5] = load_input_data(input + 160);
in[6] = load_input_data(input + 192);
in[7] = load_input_data(input + 224);
in[0] = load_input_data8(input + 0 * 32);
in[1] = load_input_data8(input + 1 * 32);
in[2] = load_input_data8(input + 2 * 32);
in[3] = load_input_data8(input + 3 * 32);
in[4] = load_input_data8(input + 4 * 32);
in[5] = load_input_data8(input + 5 * 32);
in[6] = load_input_data8(input + 6 * 32);
in[7] = load_input_data8(input + 7 * 32);
transpose_16bit_8x8(in, in);
IDCT32_34
@ -2008,10 +2008,10 @@ static void idct32_full_8x32(const __m128i *in /*in[32]*/,
// Reads eight rows from `input`, advancing 32 elements per row. For row i the
// four vectors at offsets 0, 8, 16 and 24 within the row are stored to in[i],
// in[i + 8], in[i + 16] and in[i + 24], so `in` must hold at least 32 vectors.
// NOTE(review): this listing is a rendered diff with no +/- markers. The
// load_input_data() lines appear to be the removed side and the
// load_input_data8() lines the added side — confirm against the commit.
static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
int i;
for (i = 0; i < 8; ++i) {
// Loads through load_input_data() (presumably the pre-change lines).
in[i] = load_input_data(input);
in[i + 8] = load_input_data(input + 8);
in[i + 16] = load_input_data(input + 16);
in[i + 24] = load_input_data(input + 24);
// Same destinations and offsets through load_input_data8()
// (presumably the post-change lines).
in[i] = load_input_data8(input);
in[i + 8] = load_input_data8(input + 8);
in[i + 16] = load_input_data8(input + 16);
in[i + 24] = load_input_data8(input + 24);
// Step to the next row.
input += 32;
}
}

View File

@ -76,24 +76,23 @@ static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
return _mm_packs_epi32(t0, t1);
}
// Function to allow 8 bit optimisations to be used when profile 0 is used with
// Functions to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
static INLINE __m128i load_input_data(const tran_low_t *data) {
static INLINE __m128i load_input_data4(const tran_low_t *data) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m128i zero = _mm_setzero_si128();
const __m128i in = _mm_load_si128((const __m128i *)data);
return _mm_packs_epi32(in, zero);
#else
return _mm_loadl_epi64((const __m128i *)data);
#endif
}
static INLINE __m128i load_input_data8(const tran_low_t *data) {
#if CONFIG_VP9_HIGHBITDEPTH
// in0: 0 X 1 X 2 X 3 X
// in1: 4 X 5 X 6 X 7 X
// t0: 0 4 X X 1 5 X X
// t1: 2 6 X X 3 7 X X
// t2: 0 2 4 6 X X X X
// t3: 1 3 5 7 X X X X
// rtn: 0 1 2 3 4 5 6 7
const __m128i in0 = _mm_load_si128((const __m128i *)data);
const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
const __m128i t0 = _mm_unpacklo_epi16(in0, in1);
const __m128i t1 = _mm_unpackhi_epi16(in0, in1);
const __m128i t2 = _mm_unpacklo_epi16(t0, t1);
const __m128i t3 = _mm_unpackhi_epi16(t0, t1);
return _mm_unpacklo_epi16(t2, t3);
return _mm_packs_epi32(in0, in1);
#else
return _mm_load_si128((const __m128i *)data);
#endif
@ -101,35 +100,35 @@ static INLINE __m128i load_input_data(const tran_low_t *data) {
// Fills in[0]..in[7] with eight vectors read from `input` at element offsets
// k * 8 for k = 0..7.
// NOTE(review): this listing is a rendered diff with no +/- markers. The
// load_input_data() lines appear to be the removed side and the
// load_input_data8() lines the added side — confirm against the commit.
static INLINE void load_buffer_8x8(const tran_low_t *const input,
__m128i *const in) {
// Loads through load_input_data() (presumably the pre-change lines).
in[0] = load_input_data(input + 0 * 8);
in[1] = load_input_data(input + 1 * 8);
in[2] = load_input_data(input + 2 * 8);
in[3] = load_input_data(input + 3 * 8);
in[4] = load_input_data(input + 4 * 8);
in[5] = load_input_data(input + 5 * 8);
in[6] = load_input_data(input + 6 * 8);
in[7] = load_input_data(input + 7 * 8);
// Same destinations and offsets through load_input_data8()
// (presumably the post-change lines).
in[0] = load_input_data8(input + 0 * 8);
in[1] = load_input_data8(input + 1 * 8);
in[2] = load_input_data8(input + 2 * 8);
in[3] = load_input_data8(input + 3 * 8);
in[4] = load_input_data8(input + 4 * 8);
in[5] = load_input_data8(input + 5 * 8);
in[6] = load_input_data8(input + 6 * 8);
in[7] = load_input_data8(input + 7 * 8);
}
// Fills in[0]..in[15] with sixteen vectors read from `input` at element
// offsets k * 16 for k = 0..15.
// NOTE(review): this listing is a rendered diff with no +/- markers. The
// load_input_data() lines appear to be the removed side and the
// load_input_data8() lines the added side, shown in two groups (k = 0..7,
// then k = 8..15) — confirm against the commit.
static INLINE void load_buffer_8x16(const tran_low_t *const input,
__m128i *const in) {
// k = 0..7 through load_input_data() (presumably the pre-change lines).
in[0] = load_input_data(input + 0 * 16);
in[1] = load_input_data(input + 1 * 16);
in[2] = load_input_data(input + 2 * 16);
in[3] = load_input_data(input + 3 * 16);
in[4] = load_input_data(input + 4 * 16);
in[5] = load_input_data(input + 5 * 16);
in[6] = load_input_data(input + 6 * 16);
in[7] = load_input_data(input + 7 * 16);
// k = 0..7 through load_input_data8() (presumably the post-change lines).
in[0] = load_input_data8(input + 0 * 16);
in[1] = load_input_data8(input + 1 * 16);
in[2] = load_input_data8(input + 2 * 16);
in[3] = load_input_data8(input + 3 * 16);
in[4] = load_input_data8(input + 4 * 16);
in[5] = load_input_data8(input + 5 * 16);
in[6] = load_input_data8(input + 6 * 16);
in[7] = load_input_data8(input + 7 * 16);
// k = 8..15 through load_input_data() (presumably the pre-change lines).
in[8] = load_input_data(input + 8 * 16);
in[9] = load_input_data(input + 9 * 16);
in[10] = load_input_data(input + 10 * 16);
in[11] = load_input_data(input + 11 * 16);
in[12] = load_input_data(input + 12 * 16);
in[13] = load_input_data(input + 13 * 16);
in[14] = load_input_data(input + 14 * 16);
in[15] = load_input_data(input + 15 * 16);
// k = 8..15 through load_input_data8() (presumably the post-change lines).
in[8] = load_input_data8(input + 8 * 16);
in[9] = load_input_data8(input + 9 * 16);
in[10] = load_input_data8(input + 10 * 16);
in[11] = load_input_data8(input + 11 * 16);
in[12] = load_input_data8(input + 12 * 16);
in[13] = load_input_data8(input + 13 * 16);
in[14] = load_input_data8(input + 14 * 16);
in[15] = load_input_data8(input + 15 * 16);
}
static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {

View File

@ -36,10 +36,10 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
__m128i tmp[4];
// Rows. Load 4-row input data.
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8 * 1);
in[2] = load_input_data(input + 8 * 2);
in[3] = load_input_data(input + 8 * 3);
in[0] = load_input_data4(input + 0 * 8);
in[1] = load_input_data4(input + 1 * 8);
in[2] = load_input_data4(input + 2 * 8);
in[3] = load_input_data4(input + 3 * 8);
// 4x4 Transpose
transpose_16bit_4x4(in, in);
@ -342,14 +342,14 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
int i;
// Load input data. Only need to load the top left 8x8 block.
in[0] = load_input_data(input);
in[1] = load_input_data(input + 32);
in[2] = load_input_data(input + 64);
in[3] = load_input_data(input + 96);
in[4] = load_input_data(input + 128);
in[5] = load_input_data(input + 160);
in[6] = load_input_data(input + 192);
in[7] = load_input_data(input + 224);
in[0] = load_input_data8(input + 0 * 32);
in[1] = load_input_data8(input + 1 * 32);
in[2] = load_input_data8(input + 2 * 32);
in[3] = load_input_data8(input + 3 * 32);
in[4] = load_input_data8(input + 4 * 32);
in[5] = load_input_data8(input + 5 * 32);
in[6] = load_input_data8(input + 6 * 32);
in[7] = load_input_data8(input + 7 * 32);
transpose_16bit_8x8(in, in);
idct32_34_first_half(in, stp1);
@ -383,8 +383,8 @@ static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
__m128i *in1) {
int i;
for (i = 0; i < 16; i++) {
in0[i] = load_input_data(input);
in1[i] = load_input_data(input + 8);
in0[i] = load_input_data8(input);
in1[i] = load_input_data8(input + 8);
input += 32;
}
}