SSE2 optimisation for quantize in high bit depth

When configured with high bit depth enabled, the 8bit quantize
function stopped using optimised code. This made 8bit content
decode slowly. This commit re-enables the SSE2 optimisation
(but not the SSSE3 optimisation).

Change-Id: Id015fe3c1c44580a4bff3f4bd985170f2806a9d9
This commit is contained in:
Julia Robson 2015-10-02 10:20:06 +01:00 committed by Debargha Mukherjee
parent 7777e7a8d5
commit 5e6533e707
2 changed files with 45 additions and 20 deletions

View File

@ -849,7 +849,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b/;
specialize qw/vpx_quantize_b sse2/;
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b_32x32/;

View File

@ -14,11 +14,36 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
// Reads eight consecutive coefficients starting at coeff_ptr and returns
// them as eight 16-bit lanes of one SSE2 register, lane 0 = coeff_ptr[0].
//
// With CONFIG_VP9_HIGHBITDEPTH set, each element is narrowed to int16_t by
// a plain cast before being placed in its lane, so values that do not fit
// in 16 bits are not preserved.
// NOTE(review): presumably tran_low_t is a 32-bit type in that
// configuration -- confirm against its typedef.
//
// Otherwise the eight values are fetched with a single aligned 128-bit
// load, which requires coeff_ptr to be 16-byte aligned.
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int16_t c0 = (int16_t)coeff_ptr[0];
  const int16_t c1 = (int16_t)coeff_ptr[1];
  const int16_t c2 = (int16_t)coeff_ptr[2];
  const int16_t c3 = (int16_t)coeff_ptr[3];
  const int16_t c4 = (int16_t)coeff_ptr[4];
  const int16_t c5 = (int16_t)coeff_ptr[5];
  const int16_t c6 = (int16_t)coeff_ptr[6];
  const int16_t c7 = (int16_t)coeff_ptr[7];
  // _mm_setr_epi16 takes its arguments in memory order (lane 0 first).
  return _mm_setr_epi16(c0, c1, c2, c3, c4, c5, c6, c7);
#else
  const __m128i *const src = (const __m128i *)coeff_ptr;
  return _mm_load_si128(src);
#endif
}
// Writes the eight 16-bit lanes of coeff_vals to coeff_ptr[0..7].
//
// With CONFIG_VP9_HIGHBITDEPTH set, every lane is sign-extended to 32 bits
// and the result is written with two aligned 128-bit stores, at coeff_ptr
// and coeff_ptr + 4. That layout assumes four tran_low_t elements per 128
// bits, i.e. a 32-bit tran_low_t -- NOTE(review): confirm against its
// typedef.
//
// Otherwise the register is written unchanged with one aligned 128-bit
// store. In both configurations coeff_ptr must be 16-byte aligned.
static INLINE void store_coefficients(__m128i coeff_vals,
                                      tran_low_t *coeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  // An arithmetic shift by 15 replicates each lane's sign bit across the
  // lane (0x0000 or 0xFFFF), which is exactly the upper half of the
  // sign-extended 32-bit value. No multiply by one is needed for this.
  const __m128i sign_words = _mm_srai_epi16(coeff_vals, 15);
  // Interleave (low word, sign word) pairs to form little-endian int32s.
  const __m128i widened_0_3 = _mm_unpacklo_epi16(coeff_vals, sign_words);
  const __m128i widened_4_7 = _mm_unpackhi_epi16(coeff_vals, sign_words);
  _mm_store_si128((__m128i *)coeff_ptr, widened_0_3);
  _mm_store_si128((__m128i *)(coeff_ptr + 4), widened_4_7);
#else
  _mm_store_si128((__m128i *)coeff_ptr, coeff_vals);
#endif
}
void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t* zbin_ptr,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
@ -56,8 +81,8 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
// Do DC and first 15 AC
coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
coeff0 = load_coefficients(coeff_ptr + n_coeffs);
coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@ -92,15 +117,15 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@ -134,8 +159,8 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
coeff0 = load_coefficients(coeff_ptr + n_coeffs);
coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@ -166,14 +191,14 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@ -212,10 +237,10 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
}
} else {
do {
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
store_coefficients(zero, dqcoeff_ptr + n_coeffs);
store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
store_coefficients(zero, qcoeff_ptr + n_coeffs);
store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;