Add vpx_highbd_idct{4x4,8x8,16x16}_1_add_sse2
BUG=webm:1412 Change-Id: Ia338a6057d36f9ed7eaa9cbd4dfbf0c3cbdc6468
This commit is contained in:
parent
e7cac13016
commit
c167345ffb
@ -650,6 +650,15 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
|
||||
make_tuple(
|
||||
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 12, 2),
|
||||
make_tuple(
|
||||
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 2),
|
||||
make_tuple(
|
||||
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 10, 2),
|
||||
make_tuple(
|
||||
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 12, 2),
|
||||
make_tuple(&vpx_highbd_fdct8x8_c,
|
||||
&highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 2),
|
||||
@ -668,6 +677,12 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
|
||||
make_tuple(
|
||||
&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 12, 2),
|
||||
make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 2),
|
||||
make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 10, 2),
|
||||
make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 12, 2),
|
||||
make_tuple(&vpx_highbd_fdct4x4_c,
|
||||
&highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 2),
|
||||
@ -677,6 +692,12 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
|
||||
make_tuple(
|
||||
&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 12, 2),
|
||||
make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 2),
|
||||
make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 10, 2),
|
||||
make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
|
||||
&highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 12, 2),
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
|
||||
&wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
|
||||
|
@ -629,18 +629,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
specialize qw/vpx_highbd_idct4x4_1_add neon/;
|
||||
specialize qw/vpx_highbd_idct4x4_1_add neon sse2/;
|
||||
|
||||
add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
specialize qw/vpx_highbd_idct8x8_1_add neon/;
|
||||
specialize qw/vpx_highbd_idct8x8_1_add neon sse2/;
|
||||
|
||||
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
specialize qw/vpx_highbd_idct16x16_1_add neon/;
|
||||
specialize qw/vpx_highbd_idct16x16_1_add neon sse2/;
|
||||
|
||||
add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
|
||||
|
@ -242,3 +242,8 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DC-only 16x16 high-bitdepth inverse transform: hands the work to the shared
// kernel with a block size of 16. Only input[0] is consumed by that kernel;
// the result is added to the 16x16 block at dest (row pitch = stride, in
// uint16_t units) and clamped according to bd.
void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
}
|
||||
|
@ -9,33 +9,12 @@
|
||||
*/
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
// DC-only 32x32 high-bitdepth inverse transform.
//
// Delegates entirely to highbd_idct_1_add_kernel(), like the 8x8 and 16x16
// DC-only wrappers. The DC add/clamp loop must not also be performed inline
// here: doing so would advance dest by 32 rows before the kernel call, making
// the kernel apply the DC a second time to memory past the end of the block.
//
//   input  - coefficient buffer; only input[0] is used by the kernel.
//   dest   - top-left pixel of the 32x32 destination block.
//   stride - distance between rows of dest, in uint16_t elements.
//   bd     - bit depth; pixels are clamped to [0, (1 << bd) - 1].
void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 32);
}
|
||||
|
@ -127,3 +127,26 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
// Faster than _mm_set1_epi16((1 << bd) - 1).
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
|
||||
int a1, i;
|
||||
tran_low_t out;
|
||||
__m128i dc, d;
|
||||
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
|
||||
a1 = ROUND_POWER_OF_TWO(out, 4);
|
||||
dc = _mm_set1_epi16(a1);
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
d = _mm_loadl_epi64((const __m128i *)dest);
|
||||
d = add_dc_clamp(&zero, &max, &dc, &d);
|
||||
_mm_storel_epi64((__m128i *)dest, d);
|
||||
dest += stride;
|
||||
}
|
||||
}
|
||||
|
@ -214,3 +214,8 @@ void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DC-only 8x8 high-bitdepth inverse transform: hands the work to the shared
// kernel with a block size of 8. Only input[0] is consumed by that kernel;
// the result is added to the 8x8 block at dest (row pitch = stride, in
// uint16_t units) and clamped according to bd.
void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 8);
}
|
||||
|
@ -17,6 +17,43 @@
|
||||
#include "vpx_dsp/inv_txfm.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
// Adds *dc to *in with signed 16-bit saturation, then clamps each lane to the
// range [*min, *max] using signed 16-bit comparisons, and returns the result.
// None of the pointed-to vectors are modified.
static INLINE __m128i add_dc_clamp(const __m128i *const min,
                                   const __m128i *const max,
                                   const __m128i *const dc,
                                   const __m128i *const in) {
  const __m128i sum = _mm_adds_epi16(*in, *dc);
  const __m128i floored = _mm_max_epi16(sum, *min);
  return _mm_min_epi16(floored, *max);
}
|
||||
|
||||
static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
|
||||
uint16_t *dest, int stride, int bd,
|
||||
const int size) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
// Faster than _mm_set1_epi16((1 << bd) - 1).
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
|
||||
int a1, i, j;
|
||||
tran_low_t out;
|
||||
__m128i dc, d;
|
||||
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
|
||||
a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
|
||||
dc = _mm_set1_epi16(a1);
|
||||
|
||||
for (i = 0; i < size; ++i) {
|
||||
for (j = 0; j < (size >> 3); ++j) {
|
||||
d = _mm_load_si128((const __m128i *)(&dest[j * 8]));
|
||||
d = add_dc_clamp(&zero, &max, &dc, &d);
|
||||
_mm_store_si128((__m128i *)(&dest[j * 8]), d);
|
||||
}
|
||||
dest += stride;
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
|
||||
__m128i ubounded, retval;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
|
Loading…
x
Reference in New Issue
Block a user