Use pmovmskb to skip quantize loops over empty coefficients.

If none of the 16 coefficients that we quantize per loop iteration
are larger than the zbin, directly skip to the next round of coeffs,
rather than doing a full quantize loop that will eventually result
in 16 zeroes. This incurs a jump cost, but saves a lot of other work.
The 32x32 quantizer goes from 1349 -> 1184 cycles. The same approach
yielded no significant improvement for smaller transforms, so it is
not used there (8x8: 103 -> 101 cycles; 16x16: 302 -> 306 cycles).

Change-Id: I8fca17dc2543fc8eed1dbcd5100145e3c3a9b647
This commit is contained in:
Ronald S. Bultje 2013-07-01 12:03:20 -07:00
parent 5b87240230
commit e5fb4b61b6

View File

@ -15,10 +15,10 @@ pw_1: times 8 dw 1
SECTION .text SECTION .text
%macro QUANTIZE_FN 1 %macro QUANTIZE_FN 2
cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, zbin_oq, \ shift, qcoeff, dqcoeff, dequant, zbin_oq, \
eob, scan, iscan eob, scan, iscan
cmp dword skipm, 0 cmp dword skipm, 0
jne .blank jne .blank
@ -43,9 +43,8 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m4, [r2] ; m4 = shift mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp mov r4, dqcoeffmp
mov r5, iscanmp mov r5, iscanmp
mov r2, eobmp
pxor m5, m5 ; m5 = dedicated zero pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, eob, qcoeff, dqcoeff, iscan DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
lea coeffq, [ coeffq+ncoeffq*2] lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2] lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2]
@ -119,6 +118,12 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
%endif %endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
pmovmskb r6, m7
pmovmskb r2, m12
or r6, r2
jz .skip_iter
%endif
paddw m6, m1 ; m6 += round paddw m6, m1 ; m6 += round
paddw m11, m1 ; m11 += round paddw m11, m1 ; m11 += round
pmulhw m14, m6, m2 ; m14 = m6*q>>16 pmulhw m14, m6, m2 ; m14 = m6*q>>16
@ -159,16 +164,27 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
pmaxsw m8, m13 pmaxsw m8, m13
add ncoeffq, mmsize add ncoeffq, mmsize
jl .ac_only_loop jl .ac_only_loop
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
mova [qcoeffq+ncoeffq*2+ 0], m5
mova [qcoeffq+ncoeffq*2+16], m5
mova [dqcoeffq+ncoeffq*2+ 0], m5
mova [dqcoeffq+ncoeffq*2+16], m5
add ncoeffq, mmsize
jl .ac_only_loop
%endif
.accumulate_eob: .accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer ; horizontally accumulate/max eobs and write into [eob] memory pointer
mov r2, eobmp
pshufd m7, m8, 0xe pshufd m7, m8, 0xe
pmaxsw m8, m7 pmaxsw m8, m7
pshuflw m7, m8, 0xe pshuflw m7, m8, 0xe
pmaxsw m8, m7 pmaxsw m8, m7
pshuflw m7, m8, 0x1 pshuflw m7, m8, 0x1
pmaxsw m8, m7 pmaxsw m8, m7
pextrw [eobq], m8, 0 pextrw [r2], m8, 0
RET RET
; skip-block, i.e. just write all zeroes ; skip-block, i.e. just write all zeroes
@ -194,5 +210,5 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
%endmacro %endmacro
INIT_XMM ssse3 INIT_XMM ssse3
QUANTIZE_FN b QUANTIZE_FN b, 6
QUANTIZE_FN b_32x32 QUANTIZE_FN b_32x32, 7