Optimize quantization SIMD implementation
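This commit lets the quantizer compare the absolute values of the AC coefficients against a threshold derived from the quantization step size (half the dequantization value) to determine whether the subsequent rounding and multiplication operations are needed. When no coefficient in the current batch of 16 exceeds the threshold, zeros are stored directly for both the quantized and dequantized outputs. It makes the quantization process 20% faster without changing the coding statistics. Change-Id: I735aaf6a9c0874c82175bb565b20e131464db64a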
This commit is contained in:
parent
502ac72233
commit
eed1badedd
@ -293,7 +293,8 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
|
||||
|
||||
if (!skip_block) {
|
||||
__m128i eob;
|
||||
__m128i round, quant, dequant;
|
||||
__m128i round, quant, dequant, thr;
|
||||
int16_t nzflag;
|
||||
{
|
||||
__m128i coeff0, coeff1;
|
||||
|
||||
@ -368,6 +369,7 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
|
||||
|
||||
// AC only loop
|
||||
index = 2;
|
||||
thr = _mm_srai_epi16(dequant, 1);
|
||||
while (n_coeffs < 0) {
|
||||
__m128i coeff0, coeff1;
|
||||
{
|
||||
@ -387,28 +389,39 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
|
||||
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
||||
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
||||
|
||||
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
|
||||
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
|
||||
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
|
||||
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
|
||||
nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
|
||||
_mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
|
||||
|
||||
// Reinsert signs
|
||||
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
|
||||
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
|
||||
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
||||
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
||||
if (nzflag) {
|
||||
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
|
||||
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
|
||||
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
|
||||
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
|
||||
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
|
||||
// Reinsert signs
|
||||
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
|
||||
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
|
||||
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
||||
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
||||
|
||||
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
|
||||
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
|
||||
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
|
||||
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
|
||||
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
|
||||
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
|
||||
} else {
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
|
||||
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
if (nzflag) {
|
||||
// Scan for eob
|
||||
__m128i zero_coeff0, zero_coeff1;
|
||||
__m128i nzero_coeff0, nzero_coeff1;
|
||||
|
@ -230,6 +230,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
|
||||
const int16_t* scan_ptr,
|
||||
const int16_t* iscan_ptr) {
|
||||
__m128i zero;
|
||||
__m128i thr;
|
||||
int16_t nzflag;
|
||||
(void)scan_ptr;
|
||||
(void)zbin_ptr;
|
||||
(void)quant_shift_ptr;
|
||||
@ -316,6 +318,8 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
|
||||
n_coeffs += 8 * 2;
|
||||
}
|
||||
|
||||
thr = _mm_srai_epi16(dequant, 1);
|
||||
|
||||
// AC only loop
|
||||
while (n_coeffs < 0) {
|
||||
__m128i coeff0, coeff1;
|
||||
@ -335,28 +339,39 @@ void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
|
||||
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
||||
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
||||
|
||||
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
|
||||
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
|
||||
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
|
||||
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
|
||||
nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
|
||||
_mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
|
||||
|
||||
// Reinsert signs
|
||||
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
|
||||
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
|
||||
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
||||
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
||||
if (nzflag) {
|
||||
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
|
||||
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
|
||||
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
|
||||
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
|
||||
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
|
||||
// Reinsert signs
|
||||
qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
|
||||
qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
|
||||
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
|
||||
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
|
||||
|
||||
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
|
||||
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
|
||||
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
|
||||
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
|
||||
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
|
||||
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
|
||||
} else {
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
|
||||
_mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
|
||||
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
|
||||
_mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
if (nzflag) {
|
||||
// Scan for eob
|
||||
__m128i zero_coeff0, zero_coeff1;
|
||||
__m128i nzero_coeff0, nzero_coeff1;
|
||||
|
Loading…
Reference in New Issue
Block a user