Merge "Update quantize SSSE3 SIMD to cover 32x32 transform case also."

This commit is contained in:
Ronald S. Bultje 2013-07-02 09:38:08 -07:00 committed by Gerrit Code Review
commit 9df24b41ca
3 changed files with 55 additions and 20 deletions

View File

@@ -569,6 +569,9 @@ specialize vp9_subtract_block sse2
prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
# Structured Similarity (SSIM)
#

View File

@@ -85,18 +85,19 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
}
// This function works well for large transform size.
static void quantize_sparse(int16_t *coeff_ptr, intptr_t n_coeffs,
void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
int16_t *quant_ptr, int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
int16_t *dequant_ptr, int zbin_oq_value,
uint16_t *eob_ptr, const int16_t *scan,
int *idx_arr) {
const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
int idx = 0;
int idx_arr[1024];
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -179,20 +180,18 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
// Call different quantization for different transform size.
if (n_coeffs >= 1024) {
// Save index of picked coefficient in pre-scan pass.
int idx_arr[1024];
quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
mb->plane[plane].quant,
mb->plane[plane].quant_shift,
BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
xd->plane[plane].dequant,
mb->plane[plane].zbin_extra,
&xd->plane[plane].eobs[block],
scan, idx_arr);
vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
mb->plane[plane].quant,
mb->plane[plane].quant_shift,
BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
xd->plane[plane].dequant,
mb->plane[plane].zbin_extra,
&xd->plane[plane].eobs[block],
scan, iscan);
}
else {
vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),

View File

@@ -15,10 +15,10 @@ pw_1: times 8 dw 1
SECTION .text
INIT_XMM ssse3
cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
eob, scan, iscan
%macro QUANTIZE_FN 1
cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -57,6 +57,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
%ifidn %1, b_32x32
paddw m6, m6
paddw m11, m11
%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
punpckhqdq m0, m0
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
@@ -77,9 +81,19 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
pand m13, m12
mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m8, m8
pabsw m13, m13
%endif
pmullw m8, m3 ; dqc[i] = qc[i] * q
punpckhqdq m3, m3
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m8, 1
psrlw m13, 1
psignw m8, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m8, m5 ; m8 = c[i] == 0
@@ -99,6 +113,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
%ifidn %1, b_32x32
paddw m6, m6
paddw m11, m11
%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
paddw m6, m1 ; m6 += round
@@ -115,8 +133,18 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
pand m13, m12
mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
pabsw m14, m14
pabsw m13, m13
%endif
pmullw m14, m3 ; dqc[i] = qc[i] * q
pmullw m13, m3 ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
psrlw m14, 1
psrlw m13, 1
psignw m14, m9
psignw m13, m10
%endif
mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m14, m5 ; m14 = c[i] == 0
@@ -163,3 +191,8 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
jl .blank_loop
mov word [eobq], 0
RET
%endmacro
INIT_XMM ssse3
QUANTIZE_FN b
QUANTIZE_FN b_32x32