Merge "Rewrite vpx_idct16x16_{10,256}_add_sse2() and add case 38 function"
This commit is contained in:
commit
e921c7ba8d
@ -651,6 +651,15 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
|
|||||||
make_tuple(
|
make_tuple(
|
||||||
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
|
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
|
||||||
&highbd_wrapper<vpx_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2),
|
&highbd_wrapper<vpx_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2),
|
||||||
|
make_tuple(
|
||||||
|
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
|
||||||
|
&highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 2),
|
||||||
|
make_tuple(
|
||||||
|
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
|
||||||
|
&highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 10, 2),
|
||||||
|
make_tuple(
|
||||||
|
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
|
||||||
|
&highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 12, 2),
|
||||||
make_tuple(
|
make_tuple(
|
||||||
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
|
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
|
||||||
&highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2),
|
&highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2),
|
||||||
|
@ -658,7 +658,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
|||||||
specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
|
specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
|
||||||
specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
|
specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
|
||||||
specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
|
specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
|
||||||
$vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2;
|
|
||||||
specialize qw/vpx_highbd_idct16x16_10_add neon sse2/;
|
specialize qw/vpx_highbd_idct16x16_10_add neon sse2/;
|
||||||
specialize qw/vpx_highbd_idct32x32_1024_add neon/;
|
specialize qw/vpx_highbd_idct32x32_1024_add neon/;
|
||||||
specialize qw/vpx_highbd_idct32x32_135_add neon/;
|
specialize qw/vpx_highbd_idct32x32_135_add neon/;
|
||||||
|
@ -8,237 +8,516 @@
|
|||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <emmintrin.h> // SSE2
|
||||||
|
|
||||||
#include "./vpx_dsp_rtcd.h"
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
|
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
|
||||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||||
|
|
||||||
|
// Reads two 128-bit vectors from |input| (elements 0..3 and 4..7) with
// aligned loads and narrows them, as 32-bit lanes, into a single vector of
// eight 16-bit lanes using signed saturation.
static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
  const __m128i low_half = _mm_load_si128((const __m128i *)(input + 0));
  const __m128i high_half = _mm_load_si128((const __m128i *)(input + 4));

  return _mm_packs_epi32(low_half, high_half);
}
|
||||||
|
|
||||||
|
// Final rounding for one row of eight 16-bit residuals: adds 32 with signed
// saturation, shifts right arithmetically by 6, then passes the result to
// recon_and_store_8_kernel() together with the address of the local |dest|
// pointer, a stride argument of 0 and the bit depth |bd|.
static INLINE void highbd_write_buffer_8x1(uint16_t *dest, const __m128i in,
                                           const int bd) {
  const __m128i half = _mm_set1_epi16(1 << 5);
  const __m128i rounded = _mm_srai_epi16(_mm_adds_epi16(in, half), 6);

  recon_and_store_8_kernel(rounded, &dest, 0, bd);
}
|
||||||
|
|
||||||
|
static INLINE void recon_and_store_4_kernel(const __m128i in,
|
||||||
|
uint16_t *const dest,
|
||||||
|
const int bd) {
|
||||||
|
__m128i d;
|
||||||
|
|
||||||
|
d = _mm_loadl_epi64((const __m128i *)dest);
|
||||||
|
d = add_clamp(d, in, bd);
|
||||||
|
_mm_storel_epi64((__m128i *)dest, d);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final rounding for one row of four 32-bit residuals: adds 32, shifts right
// arithmetically by 6, narrows to 16-bit lanes with signed saturation (the
// same vector is used for both halves of the pack) and forwards the result
// to recon_and_store_4_kernel().
static INLINE void highbd_write_buffer_4x1(uint16_t *const dest,
                                           const __m128i in, const int bd) {
  const __m128i half = _mm_set1_epi32(1 << 5);
  const __m128i shifted = _mm_srai_epi32(_mm_add_epi32(in, half), 6);
  const __m128i narrowed = _mm_packs_epi32(shifted, shifted);

  recon_and_store_4_kernel(narrowed, dest, bd);
}
|
||||||
|
|
||||||
|
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
|
||||||
|
__m128i *const out) {
|
||||||
|
__m128i temp1[2], temp2, sign[2];
|
||||||
|
// stage 5
|
||||||
|
out[0] = _mm_add_epi32(in[0], in[3]);
|
||||||
|
out[1] = _mm_add_epi32(in[1], in[2]);
|
||||||
|
out[2] = _mm_sub_epi32(in[1], in[2]);
|
||||||
|
out[3] = _mm_sub_epi32(in[0], in[3]);
|
||||||
|
temp2 = _mm_sub_epi32(in[6], in[5]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
out[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
temp2 = _mm_add_epi32(in[6], in[5]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
out[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
out[8] = _mm_add_epi32(in[8], in[11]);
|
||||||
|
out[9] = _mm_add_epi32(in[9], in[10]);
|
||||||
|
out[10] = _mm_sub_epi32(in[9], in[10]);
|
||||||
|
out[11] = _mm_sub_epi32(in[8], in[11]);
|
||||||
|
out[12] = _mm_sub_epi32(in[15], in[12]);
|
||||||
|
out[13] = _mm_sub_epi32(in[14], in[13]);
|
||||||
|
out[14] = _mm_add_epi32(in[14], in[13]);
|
||||||
|
out[15] = _mm_add_epi32(in[15], in[12]);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
|
||||||
|
__m128i *const out) {
|
||||||
|
__m128i temp1[2], temp2, sign[2];
|
||||||
|
out[0] = _mm_add_epi32(in[0], in[7]);
|
||||||
|
out[1] = _mm_add_epi32(in[1], in[6]);
|
||||||
|
out[2] = _mm_add_epi32(in[2], in[5]);
|
||||||
|
out[3] = _mm_add_epi32(in[3], in[4]);
|
||||||
|
out[4] = _mm_sub_epi32(in[3], in[4]);
|
||||||
|
out[5] = _mm_sub_epi32(in[2], in[5]);
|
||||||
|
out[6] = _mm_sub_epi32(in[1], in[6]);
|
||||||
|
out[7] = _mm_sub_epi32(in[0], in[7]);
|
||||||
|
out[8] = in[8];
|
||||||
|
out[9] = in[9];
|
||||||
|
temp2 = _mm_sub_epi32(in[13], in[10]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
out[10] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
temp2 = _mm_add_epi32(in[13], in[10]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
out[13] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
|
||||||
|
temp2 = _mm_sub_epi32(in[12], in[11]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
out[11] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
temp2 = _mm_add_epi32(in[12], in[11]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
out[12] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
out[14] = in[14];
|
||||||
|
out[15] = in[15];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stage 7 (last stage) of the 16-point inverse transform on 32-bit lanes:
//   out[k]      = in[k] + in[15 - k]   for k = 0..7
//   out[15 - k] = in[k] - in[15 - k]   for k = 0..7
// The sums are written first and the differences second, in ascending
// output index.
static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
                                             __m128i *const out) {
  int k;

  for (k = 0; k < 8; ++k) {
    out[k] = _mm_add_epi32(in[k], in[15 - k]);
  }
  for (k = 0; k < 8; ++k) {
    out[8 + k] = _mm_sub_epi32(in[7 - k], in[8 + k]);
  }
}
|
||||||
|
|
||||||
|
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
|
||||||
|
__m128i step1[16], step2[16];
|
||||||
|
__m128i temp1[4], temp2, sign[2];
|
||||||
|
|
||||||
|
// stage 2
|
||||||
|
highbd_multiplication_and_add_sse2(io[1], io[15], (int)cospi_30_64,
|
||||||
|
(int)cospi_2_64, &step2[8], &step2[15]);
|
||||||
|
highbd_multiplication_and_add_sse2(io[9], io[7], (int)cospi_14_64,
|
||||||
|
(int)cospi_18_64, &step2[9], &step2[14]);
|
||||||
|
highbd_multiplication_and_add_sse2(io[5], io[11], (int)cospi_22_64,
|
||||||
|
(int)cospi_10_64, &step2[10], &step2[13]);
|
||||||
|
highbd_multiplication_and_add_sse2(io[13], io[3], (int)cospi_6_64,
|
||||||
|
(int)cospi_26_64, &step2[11], &step2[12]);
|
||||||
|
|
||||||
|
// stage 3
|
||||||
|
highbd_multiplication_and_add_sse2(io[2], io[14], (int)cospi_28_64,
|
||||||
|
(int)cospi_4_64, &step1[4], &step1[7]);
|
||||||
|
highbd_multiplication_and_add_sse2(io[10], io[6], (int)cospi_12_64,
|
||||||
|
(int)cospi_20_64, &step1[5], &step1[6]);
|
||||||
|
step1[8] = _mm_add_epi32(step2[8], step2[9]);
|
||||||
|
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
|
||||||
|
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
|
||||||
|
step1[11] = _mm_add_epi32(step2[10], step2[11]);
|
||||||
|
step1[12] = _mm_add_epi32(step2[13], step2[12]);
|
||||||
|
step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
|
||||||
|
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
|
||||||
|
step1[15] = _mm_add_epi32(step2[15], step2[14]);
|
||||||
|
|
||||||
|
// stage 4
|
||||||
|
temp2 = _mm_add_epi32(io[0], io[8]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
temp2 = _mm_sub_epi32(io[0], io[8]);
|
||||||
|
abs_extend_64bit_sse2(temp2, temp1, sign);
|
||||||
|
step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
highbd_multiplication_and_add_sse2(io[4], io[12], (int)cospi_24_64,
|
||||||
|
(int)cospi_8_64, &step2[2], &step2[3]);
|
||||||
|
highbd_multiplication_and_add_sse2(step1[14], step1[9], (int)cospi_24_64,
|
||||||
|
(int)cospi_8_64, &step2[9], &step2[14]);
|
||||||
|
highbd_multiplication_and_add_sse2(step1[10], step1[13], (int)cospi_8_64,
|
||||||
|
(int)cospi_24_64, &step2[13], &step2[10]);
|
||||||
|
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
|
||||||
|
step1[4] = _mm_add_epi32(step1[4], step1[5]);
|
||||||
|
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
|
||||||
|
step1[7] = _mm_add_epi32(step1[7], step1[6]);
|
||||||
|
step2[8] = step1[8];
|
||||||
|
step2[11] = step1[11];
|
||||||
|
step2[12] = step1[12];
|
||||||
|
step2[15] = step1[15];
|
||||||
|
|
||||||
|
highbd_idct16_4col_stage5(step2, step1);
|
||||||
|
highbd_idct16_4col_stage6(step1, step2);
|
||||||
|
highbd_idct16_4col_stage7(step2, io);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
|
||||||
|
__m128i step1[16], step2[16];
|
||||||
|
__m128i temp1[2], sign[2];
|
||||||
|
|
||||||
|
// stage 2
|
||||||
|
highbd_multiplication_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
|
||||||
|
&step2[8], &step2[15]);
|
||||||
|
highbd_multiplication_neg_sse2(io[7], (int)cospi_14_64, (int)cospi_18_64,
|
||||||
|
&step2[9], &step2[14]);
|
||||||
|
highbd_multiplication_sse2(io[5], (int)cospi_22_64, (int)cospi_10_64,
|
||||||
|
&step2[10], &step2[13]);
|
||||||
|
highbd_multiplication_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
|
||||||
|
&step2[11], &step2[12]);
|
||||||
|
|
||||||
|
// stage 3
|
||||||
|
highbd_multiplication_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
|
||||||
|
&step1[4], &step1[7]);
|
||||||
|
highbd_multiplication_neg_sse2(io[6], (int)cospi_12_64, (int)cospi_20_64,
|
||||||
|
&step1[5], &step1[6]);
|
||||||
|
step1[8] = _mm_add_epi32(step2[8], step2[9]);
|
||||||
|
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
|
||||||
|
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
|
||||||
|
step1[11] = _mm_add_epi32(step2[10], step2[11]);
|
||||||
|
step1[12] = _mm_add_epi32(step2[13], step2[12]);
|
||||||
|
step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
|
||||||
|
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
|
||||||
|
step1[15] = _mm_add_epi32(step2[15], step2[14]);
|
||||||
|
|
||||||
|
// stage 4
|
||||||
|
abs_extend_64bit_sse2(io[0], temp1, sign);
|
||||||
|
step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
|
||||||
|
step2[1] = step2[0];
|
||||||
|
highbd_multiplication_sse2(io[4], (int)cospi_24_64, (int)cospi_8_64,
|
||||||
|
&step2[2], &step2[3]);
|
||||||
|
highbd_multiplication_and_add_sse2(step1[14], step1[9], (int)cospi_24_64,
|
||||||
|
(int)cospi_8_64, &step2[9], &step2[14]);
|
||||||
|
highbd_multiplication_and_add_sse2(step1[10], step1[13], (int)cospi_8_64,
|
||||||
|
(int)cospi_24_64, &step2[13], &step2[10]);
|
||||||
|
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
|
||||||
|
step1[4] = _mm_add_epi32(step1[4], step1[5]);
|
||||||
|
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
|
||||||
|
step1[7] = _mm_add_epi32(step1[7], step1[6]);
|
||||||
|
step2[8] = step1[8];
|
||||||
|
step2[11] = step1[11];
|
||||||
|
step2[12] = step1[12];
|
||||||
|
step2[15] = step1[15];
|
||||||
|
|
||||||
|
highbd_idct16_4col_stage5(step2, step1);
|
||||||
|
highbd_idct16_4col_stage6(step1, step2);
|
||||||
|
highbd_idct16_4col_stage7(step2, io);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
|
||||||
|
__m128i step1[16], step2[16];
|
||||||
|
__m128i temp[2], sign[2];
|
||||||
|
|
||||||
|
// stage 2
|
||||||
|
highbd_multiplication_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
|
||||||
|
&step2[8], &step2[15]);
|
||||||
|
highbd_multiplication_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
|
||||||
|
&step2[11], &step2[12]);
|
||||||
|
|
||||||
|
// stage 3
|
||||||
|
highbd_multiplication_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
|
||||||
|
&step1[4], &step1[7]);
|
||||||
|
step1[8] = step2[8];
|
||||||
|
step1[9] = step2[8];
|
||||||
|
step1[10] =
|
||||||
|
_mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
|
||||||
|
step1[11] = step2[11];
|
||||||
|
step1[12] = step2[12];
|
||||||
|
step1[13] =
|
||||||
|
_mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
|
||||||
|
step1[14] = step2[15];
|
||||||
|
step1[15] = step2[15];
|
||||||
|
|
||||||
|
// stage 4
|
||||||
|
abs_extend_64bit_sse2(io[0], temp, sign);
|
||||||
|
step2[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
|
||||||
|
step2[1] = step2[0];
|
||||||
|
step2[2] = _mm_setzero_si128();
|
||||||
|
step2[3] = _mm_setzero_si128();
|
||||||
|
highbd_multiplication_and_add_sse2(step1[14], step1[9], (int)cospi_24_64,
|
||||||
|
(int)cospi_8_64, &step2[9], &step2[14]);
|
||||||
|
highbd_multiplication_and_add_sse2(step1[10], step1[13], (int)cospi_8_64,
|
||||||
|
(int)cospi_24_64, &step2[13], &step2[10]);
|
||||||
|
step2[5] = step1[4];
|
||||||
|
step2[6] = step1[7];
|
||||||
|
step2[8] = step1[8];
|
||||||
|
step2[11] = step1[11];
|
||||||
|
step2[12] = step1[12];
|
||||||
|
step2[15] = step1[15];
|
||||||
|
|
||||||
|
highbd_idct16_4col_stage5(step2, step1);
|
||||||
|
highbd_idct16_4col_stage6(step1, step2);
|
||||||
|
highbd_idct16_4col_stage7(step2, io);
|
||||||
|
}
|
||||||
|
|
||||||
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
|
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||||
int stride, int bd) {
|
int stride, int bd) {
|
||||||
tran_low_t out[16 * 16];
|
int i;
|
||||||
tran_low_t *outptr = out;
|
__m128i out[16], *in;
|
||||||
int i, j, test;
|
|
||||||
__m128i inptr[32];
|
|
||||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
|
||||||
const __m128i zero = _mm_set1_epi16(0);
|
|
||||||
const __m128i rounding = _mm_set1_epi16(32);
|
|
||||||
const __m128i max = _mm_set1_epi16(3155);
|
|
||||||
const __m128i min = _mm_set1_epi16(-3155);
|
|
||||||
int optimised_cols = 0;
|
|
||||||
|
|
||||||
// Load input into __m128i & pack to 16 bits
|
if (bd == 8) {
|
||||||
for (i = 0; i < 16; i++) {
|
__m128i l[16], r[16];
|
||||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
|
|
||||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
|
|
||||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
|
||||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
|
|
||||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
|
|
||||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the min & max for the row transform
|
in = l;
|
||||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
for (i = 0; i < 2; i++) {
|
||||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
in[0] = load_pack_8_32bit(input + 0 * 16);
|
||||||
for (i = 2; i < 32; i++) {
|
in[1] = load_pack_8_32bit(input + 1 * 16);
|
||||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
in[2] = load_pack_8_32bit(input + 2 * 16);
|
||||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
in[3] = load_pack_8_32bit(input + 3 * 16);
|
||||||
}
|
in[4] = load_pack_8_32bit(input + 4 * 16);
|
||||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
in[5] = load_pack_8_32bit(input + 5 * 16);
|
||||||
min_input = _mm_cmplt_epi16(min_input, min);
|
in[6] = load_pack_8_32bit(input + 6 * 16);
|
||||||
temp1 = _mm_or_si128(max_input, min_input);
|
in[7] = load_pack_8_32bit(input + 7 * 16);
|
||||||
test = _mm_movemask_epi8(temp1);
|
transpose_16bit_8x8(in, in);
|
||||||
|
|
||||||
if (!test) {
|
in[8] = load_pack_8_32bit(input + 0 * 16 + 8);
|
||||||
// Do the row transform
|
in[9] = load_pack_8_32bit(input + 1 * 16 + 8);
|
||||||
idct16_sse2(inptr, inptr + 16);
|
in[10] = load_pack_8_32bit(input + 2 * 16 + 8);
|
||||||
|
in[11] = load_pack_8_32bit(input + 3 * 16 + 8);
|
||||||
// Find the min & max for the column transform
|
in[12] = load_pack_8_32bit(input + 4 * 16 + 8);
|
||||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
in[13] = load_pack_8_32bit(input + 5 * 16 + 8);
|
||||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
in[14] = load_pack_8_32bit(input + 6 * 16 + 8);
|
||||||
for (i = 2; i < 32; i++) {
|
in[15] = load_pack_8_32bit(input + 7 * 16 + 8);
|
||||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
transpose_16bit_8x8(in + 8, in + 8);
|
||||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
idct16_8col(in);
|
||||||
|
in = r;
|
||||||
|
input += 128;
|
||||||
}
|
}
|
||||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
|
||||||
min_input = _mm_cmplt_epi16(min_input, min);
|
|
||||||
temp1 = _mm_or_si128(max_input, min_input);
|
|
||||||
test = _mm_movemask_epi8(temp1);
|
|
||||||
|
|
||||||
if (test) {
|
for (i = 0; i < 2; i++) {
|
||||||
transpose_16bit_16x16(inptr, inptr + 16);
|
int j;
|
||||||
for (i = 0; i < 16; i++) {
|
transpose_16bit_8x8(l + i * 8, out);
|
||||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
transpose_16bit_8x8(r + i * 8, out + 8);
|
||||||
temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
idct16_8col(out);
|
||||||
temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
|
|
||||||
sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
|
|
||||||
temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
|
|
||||||
temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Set to use the optimised transform for the column
|
|
||||||
optimised_cols = 1;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Run the un-optimised row transform
|
|
||||||
for (i = 0; i < 16; ++i) {
|
|
||||||
vpx_highbd_idct16_c(input, outptr, bd);
|
|
||||||
input += 16;
|
|
||||||
outptr += 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (optimised_cols) {
|
|
||||||
idct16_sse2(inptr, inptr + 16);
|
|
||||||
|
|
||||||
// Final round & shift and Reconstruction and Store
|
|
||||||
{
|
|
||||||
__m128i d[2];
|
|
||||||
for (i = 0; i < 16; i++) {
|
|
||||||
inptr[i] = _mm_add_epi16(inptr[i], rounding);
|
|
||||||
inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
|
|
||||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
|
||||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
|
|
||||||
inptr[i] = _mm_srai_epi16(inptr[i], 6);
|
|
||||||
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
|
|
||||||
d[0] = add_clamp(d[0], inptr[i], bd);
|
|
||||||
d[1] = add_clamp(d[1], inptr[i + 16], bd);
|
|
||||||
// Store
|
|
||||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
|
|
||||||
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Run the un-optimised column transform
|
|
||||||
tran_low_t temp_in[16], temp_out[16];
|
|
||||||
for (i = 0; i < 16; ++i) {
|
|
||||||
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
|
|
||||||
vpx_highbd_idct16_c(temp_in, temp_out, bd);
|
|
||||||
for (j = 0; j < 16; ++j) {
|
for (j = 0; j < 16; ++j) {
|
||||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
|
||||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
|
||||||
}
|
}
|
||||||
|
dest += 8;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
__m128i all[4][16];
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
in = all[i];
|
||||||
|
in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
|
||||||
|
in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
|
||||||
|
in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
|
||||||
|
in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
|
||||||
|
in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
|
||||||
|
in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
|
||||||
|
in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
|
||||||
|
in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
|
||||||
|
transpose_32bit_8x4(in, in);
|
||||||
|
|
||||||
|
in[8] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 8));
|
||||||
|
in[9] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 12));
|
||||||
|
in[10] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 8));
|
||||||
|
in[11] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 12));
|
||||||
|
in[12] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 8));
|
||||||
|
in[13] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 12));
|
||||||
|
in[14] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 8));
|
||||||
|
in[15] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 12));
|
||||||
|
transpose_32bit_8x4(in + 8, in + 8);
|
||||||
|
|
||||||
|
highbd_idct16_4col(in);
|
||||||
|
input += 4 * 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
int j;
|
||||||
|
out[0] = all[0][4 * i + 0];
|
||||||
|
out[1] = all[1][4 * i + 0];
|
||||||
|
out[2] = all[0][4 * i + 1];
|
||||||
|
out[3] = all[1][4 * i + 1];
|
||||||
|
out[4] = all[0][4 * i + 2];
|
||||||
|
out[5] = all[1][4 * i + 2];
|
||||||
|
out[6] = all[0][4 * i + 3];
|
||||||
|
out[7] = all[1][4 * i + 3];
|
||||||
|
transpose_32bit_8x4(out, out);
|
||||||
|
|
||||||
|
out[8] = all[2][4 * i + 0];
|
||||||
|
out[9] = all[3][4 * i + 0];
|
||||||
|
out[10] = all[2][4 * i + 1];
|
||||||
|
out[11] = all[3][4 * i + 1];
|
||||||
|
out[12] = all[2][4 * i + 2];
|
||||||
|
out[13] = all[3][4 * i + 2];
|
||||||
|
out[14] = all[2][4 * i + 3];
|
||||||
|
out[15] = all[3][4 * i + 3];
|
||||||
|
transpose_32bit_8x4(out + 8, out + 8);
|
||||||
|
|
||||||
|
highbd_idct16_4col(out);
|
||||||
|
|
||||||
|
for (j = 0; j < 16; ++j) {
|
||||||
|
highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
|
||||||
|
}
|
||||||
|
dest += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||||
|
int stride, int bd) {
|
||||||
|
int i;
|
||||||
|
__m128i out[16];
|
||||||
|
|
||||||
|
if (bd == 8) {
|
||||||
|
__m128i in[16];
|
||||||
|
|
||||||
|
in[0] = load_pack_8_32bit(input + 0 * 16);
|
||||||
|
in[1] = load_pack_8_32bit(input + 1 * 16);
|
||||||
|
in[2] = load_pack_8_32bit(input + 2 * 16);
|
||||||
|
in[3] = load_pack_8_32bit(input + 3 * 16);
|
||||||
|
in[4] = load_pack_8_32bit(input + 4 * 16);
|
||||||
|
in[5] = load_pack_8_32bit(input + 5 * 16);
|
||||||
|
in[6] = load_pack_8_32bit(input + 6 * 16);
|
||||||
|
in[7] = load_pack_8_32bit(input + 7 * 16);
|
||||||
|
transpose_16bit_8x8(in, in);
|
||||||
|
|
||||||
|
in[8] = _mm_setzero_si128();
|
||||||
|
in[9] = _mm_setzero_si128();
|
||||||
|
in[10] = _mm_setzero_si128();
|
||||||
|
in[11] = _mm_setzero_si128();
|
||||||
|
in[12] = _mm_setzero_si128();
|
||||||
|
in[13] = _mm_setzero_si128();
|
||||||
|
in[14] = _mm_setzero_si128();
|
||||||
|
in[15] = _mm_setzero_si128();
|
||||||
|
idct16_8col(in);
|
||||||
|
|
||||||
|
for (i = 0; i < 2; i++) {
|
||||||
|
int j;
|
||||||
|
transpose_16bit_8x8(in + i * 8, out);
|
||||||
|
out[8] = _mm_setzero_si128();
|
||||||
|
out[9] = _mm_setzero_si128();
|
||||||
|
out[10] = _mm_setzero_si128();
|
||||||
|
out[11] = _mm_setzero_si128();
|
||||||
|
out[12] = _mm_setzero_si128();
|
||||||
|
out[13] = _mm_setzero_si128();
|
||||||
|
out[14] = _mm_setzero_si128();
|
||||||
|
out[15] = _mm_setzero_si128();
|
||||||
|
idct16_8col(out);
|
||||||
|
|
||||||
|
for (j = 0; j < 16; ++j) {
|
||||||
|
highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
|
||||||
|
}
|
||||||
|
dest += 8;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
__m128i all[2][16], *in;
|
||||||
|
|
||||||
|
for (i = 0; i < 2; i++) {
|
||||||
|
in = all[i];
|
||||||
|
in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
|
||||||
|
in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
|
||||||
|
in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
|
||||||
|
in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
|
||||||
|
in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
|
||||||
|
in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
|
||||||
|
in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
|
||||||
|
in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
|
||||||
|
transpose_32bit_8x4(in, in);
|
||||||
|
highbd_idct16x16_38_4col(in);
|
||||||
|
input += 4 * 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
int j;
|
||||||
|
out[0] = all[0][4 * i + 0];
|
||||||
|
out[1] = all[1][4 * i + 0];
|
||||||
|
out[2] = all[0][4 * i + 1];
|
||||||
|
out[3] = all[1][4 * i + 1];
|
||||||
|
out[4] = all[0][4 * i + 2];
|
||||||
|
out[5] = all[1][4 * i + 2];
|
||||||
|
out[6] = all[0][4 * i + 3];
|
||||||
|
out[7] = all[1][4 * i + 3];
|
||||||
|
transpose_32bit_8x4(out, out);
|
||||||
|
highbd_idct16x16_38_4col(out);
|
||||||
|
|
||||||
|
for (j = 0; j < 16; ++j) {
|
||||||
|
highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
|
||||||
|
}
|
||||||
|
dest += 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
|
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||||
int stride, int bd) {
|
int stride, int bd) {
|
||||||
tran_low_t out[16 * 16] = { 0 };
|
int i;
|
||||||
tran_low_t *outptr = out;
|
__m128i out[16];
|
||||||
int i, j, test;
|
|
||||||
__m128i inptr[32];
|
|
||||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
|
||||||
const __m128i zero = _mm_set1_epi16(0);
|
|
||||||
const __m128i rounding = _mm_set1_epi16(32);
|
|
||||||
const __m128i max = _mm_set1_epi16(3155);
|
|
||||||
const __m128i min = _mm_set1_epi16(-3155);
|
|
||||||
int optimised_cols = 0;
|
|
||||||
|
|
||||||
// Load input into __m128i & pack to 16 bits
|
if (bd == 8) {
|
||||||
for (i = 0; i < 16; i++) {
|
__m128i in[16], l[16];
|
||||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
|
|
||||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
|
|
||||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
|
||||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
|
|
||||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
|
|
||||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the min & max for the row transform
|
in[0] = load_pack_8_32bit(input + 0 * 16);
|
||||||
// Since all non-zero dct coefficients are in upper-left 4x4 area,
|
in[1] = load_pack_8_32bit(input + 1 * 16);
|
||||||
// we only need to consider first 4 rows here.
|
in[2] = load_pack_8_32bit(input + 2 * 16);
|
||||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
in[3] = load_pack_8_32bit(input + 3 * 16);
|
||||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
|
||||||
for (i = 2; i < 4; i++) {
|
|
||||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
|
||||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
|
||||||
}
|
|
||||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
|
||||||
min_input = _mm_cmplt_epi16(min_input, min);
|
|
||||||
temp1 = _mm_or_si128(max_input, min_input);
|
|
||||||
test = _mm_movemask_epi8(temp1);
|
|
||||||
|
|
||||||
if (!test) {
|
idct16x16_10_pass1(in, l);
|
||||||
// Do the row transform (N.B. This transposes inptr)
|
|
||||||
idct16_sse2(inptr, inptr + 16);
|
|
||||||
|
|
||||||
// Find the min & max for the column transform
|
for (i = 0; i < 2; i++) {
|
||||||
// N.B. Only first 4 cols contain non-zero coeffs
|
int j;
|
||||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
idct16x16_10_pass2(l + 8 * i, in);
|
||||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
|
||||||
for (i = 2; i < 16; i++) {
|
|
||||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
|
||||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
|
||||||
}
|
|
||||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
|
||||||
min_input = _mm_cmplt_epi16(min_input, min);
|
|
||||||
temp1 = _mm_or_si128(max_input, min_input);
|
|
||||||
test = _mm_movemask_epi8(temp1);
|
|
||||||
|
|
||||||
if (test) {
|
|
||||||
// Use fact only first 4 rows contain non-zero coeffs
|
|
||||||
transpose_16bit_8x8(inptr, inptr);
|
|
||||||
transpose_16bit_8x8(inptr + 8, inptr + 16);
|
|
||||||
for (i = 0; i < 4; i++) {
|
|
||||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
|
||||||
temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
|
||||||
temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
|
|
||||||
sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
|
|
||||||
temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
|
|
||||||
temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
|
|
||||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Set to use the optimised transform for the column
|
|
||||||
optimised_cols = 1;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Run the un-optimised row transform
|
|
||||||
for (i = 0; i < 4; ++i) {
|
|
||||||
vpx_highbd_idct16_c(input, outptr, bd);
|
|
||||||
input += 16;
|
|
||||||
outptr += 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (optimised_cols) {
|
|
||||||
idct16_sse2(inptr, inptr + 16);
|
|
||||||
|
|
||||||
// Final round & shift and Reconstruction and Store
|
|
||||||
{
|
|
||||||
__m128i d[2];
|
|
||||||
for (i = 0; i < 16; i++) {
|
|
||||||
inptr[i] = _mm_add_epi16(inptr[i], rounding);
|
|
||||||
inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
|
|
||||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
|
||||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
|
|
||||||
inptr[i] = _mm_srai_epi16(inptr[i], 6);
|
|
||||||
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
|
|
||||||
d[0] = add_clamp(d[0], inptr[i], bd);
|
|
||||||
d[1] = add_clamp(d[1], inptr[i + 16], bd);
|
|
||||||
// Store
|
|
||||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
|
|
||||||
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Run the un-optimised column transform
|
|
||||||
tran_low_t temp_in[16], temp_out[16];
|
|
||||||
for (i = 0; i < 16; ++i) {
|
|
||||||
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
|
|
||||||
vpx_highbd_idct16_c(temp_in, temp_out, bd);
|
|
||||||
for (j = 0; j < 16; ++j) {
|
for (j = 0; j < 16; ++j) {
|
||||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
highbd_write_buffer_8x1(dest + j * stride, in[j], bd);
|
||||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
|
||||||
}
|
}
|
||||||
|
dest += 8;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
__m128i all[2][16], *in;
|
||||||
|
|
||||||
|
for (i = 0; i < 2; i++) {
|
||||||
|
in = all[i];
|
||||||
|
in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
|
||||||
|
in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
|
||||||
|
in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
|
||||||
|
in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
|
||||||
|
transpose_32bit_4x4(in, in);
|
||||||
|
highbd_idct16x16_10_4col(in);
|
||||||
|
input += 4 * 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
int j;
|
||||||
|
transpose_32bit_4x4(&all[0][4 * i], out);
|
||||||
|
highbd_idct16x16_10_4col(out);
|
||||||
|
|
||||||
|
for (j = 0; j < 16; ++j) {
|
||||||
|
highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
|
||||||
|
}
|
||||||
|
dest += 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -115,7 +115,7 @@ static INLINE void highbd_multiplication_and_add_sse2(
|
|||||||
__m128i *const out0, __m128i *const out1) {
|
__m128i *const out0, __m128i *const out1) {
|
||||||
const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
|
const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
|
||||||
const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
|
const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
|
||||||
__m128i temp1[4], temp2[4], sign1[4], sign2[4];
|
__m128i temp1[4], temp2[4], sign1[2], sign2[2];
|
||||||
|
|
||||||
abs_extend_64bit_sse2(in0, temp1, sign1);
|
abs_extend_64bit_sse2(in0, temp1, sign1);
|
||||||
abs_extend_64bit_sse2(in1, temp2, sign2);
|
abs_extend_64bit_sse2(in1, temp2, sign2);
|
||||||
@ -139,6 +139,29 @@ static INLINE void highbd_multiplication_and_add_sse2(
|
|||||||
*out1 = pack_4(temp2[0], temp2[1]);
|
*out1 = pack_4(temp2[0], temp2[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Note: c0 and c1 must be non negative.
|
||||||
|
static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0,
|
||||||
|
const int c1, __m128i *const out0,
|
||||||
|
__m128i *const out1) {
|
||||||
|
__m128i temp[2], sign[2];
|
||||||
|
|
||||||
|
abs_extend_64bit_sse2(in, temp, sign);
|
||||||
|
*out0 = multiplication_round_shift_sse2(temp, sign, c0);
|
||||||
|
*out1 = multiplication_round_shift_sse2(temp, sign, c1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: c0 and c1 must be non negative.
|
||||||
|
static INLINE void highbd_multiplication_neg_sse2(const __m128i in,
|
||||||
|
const int c0, const int c1,
|
||||||
|
__m128i *const out0,
|
||||||
|
__m128i *const out1) {
|
||||||
|
__m128i temp[2], sign[2];
|
||||||
|
|
||||||
|
abs_extend_64bit_sse2(in, temp, sign);
|
||||||
|
*out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
|
||||||
|
*out1 = multiplication_round_shift_sse2(temp, sign, c0);
|
||||||
|
}
|
||||||
|
|
||||||
static INLINE void highbd_idct8_stage4(const __m128i *const in,
|
static INLINE void highbd_idct8_stage4(const __m128i *const in,
|
||||||
__m128i *const out) {
|
__m128i *const out) {
|
||||||
out[0] = _mm_add_epi32(in[0], in[7]);
|
out[0] = _mm_add_epi32(in[0], in[7]);
|
||||||
|
Loading…
Reference in New Issue
Block a user