Use pmovmskb to skip quantize loops over empty coefficients.
If none of the 16 coefficients quantized per loop iteration is larger than the zbin, skip directly to the next round of coefficients rather than running a full quantize iteration that would eventually produce 16 zeroes. This incurs a jump cost but saves a lot of other work: 32x32 quant goes from 1349 -> 1184 cycles. The same approach yielded no significant improvement for smaller transforms, so it is not used there (8x8: 103 -> 101 cycles; 16x16: 302 -> 306 cycles). Change-Id: I8fca17dc2543fc8eed1dbcd5100145e3c3a9b647
This commit is contained in:
parent
5b87240230
commit
e5fb4b61b6
@ -15,10 +15,10 @@ pw_1: times 8 dw 1
|
|||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
%macro QUANTIZE_FN 1
|
%macro QUANTIZE_FN 2
|
||||||
cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
||||||
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
|
shift, qcoeff, dqcoeff, dequant, zbin_oq, \
|
||||||
eob, scan, iscan
|
eob, scan, iscan
|
||||||
cmp dword skipm, 0
|
cmp dword skipm, 0
|
||||||
jne .blank
|
jne .blank
|
||||||
|
|
||||||
@ -43,9 +43,8 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
|||||||
mova m4, [r2] ; m4 = shift
|
mova m4, [r2] ; m4 = shift
|
||||||
mov r4, dqcoeffmp
|
mov r4, dqcoeffmp
|
||||||
mov r5, iscanmp
|
mov r5, iscanmp
|
||||||
mov r2, eobmp
|
|
||||||
pxor m5, m5 ; m5 = dedicated zero
|
pxor m5, m5 ; m5 = dedicated zero
|
||||||
DEFINE_ARGS coeff, ncoeff, eob, qcoeff, dqcoeff, iscan
|
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
|
||||||
lea coeffq, [ coeffq+ncoeffq*2]
|
lea coeffq, [ coeffq+ncoeffq*2]
|
||||||
lea iscanq, [ iscanq+ncoeffq*2]
|
lea iscanq, [ iscanq+ncoeffq*2]
|
||||||
lea qcoeffq, [ qcoeffq+ncoeffq*2]
|
lea qcoeffq, [ qcoeffq+ncoeffq*2]
|
||||||
@ -119,6 +118,12 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
|||||||
%endif
|
%endif
|
||||||
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
|
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
|
||||||
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
|
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
|
||||||
|
%ifidn %1, b_32x32
|
||||||
|
pmovmskb r6, m7
|
||||||
|
pmovmskb r2, m12
|
||||||
|
or r6, r2
|
||||||
|
jz .skip_iter
|
||||||
|
%endif
|
||||||
paddw m6, m1 ; m6 += round
|
paddw m6, m1 ; m6 += round
|
||||||
paddw m11, m1 ; m11 += round
|
paddw m11, m1 ; m11 += round
|
||||||
pmulhw m14, m6, m2 ; m14 = m6*q>>16
|
pmulhw m14, m6, m2 ; m14 = m6*q>>16
|
||||||
@ -159,16 +164,27 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
|||||||
pmaxsw m8, m13
|
pmaxsw m8, m13
|
||||||
add ncoeffq, mmsize
|
add ncoeffq, mmsize
|
||||||
jl .ac_only_loop
|
jl .ac_only_loop
|
||||||
|
%ifidn %1, b_32x32
|
||||||
|
jmp .accumulate_eob
|
||||||
|
.skip_iter:
|
||||||
|
mova [qcoeffq+ncoeffq*2+ 0], m5
|
||||||
|
mova [qcoeffq+ncoeffq*2+16], m5
|
||||||
|
mova [dqcoeffq+ncoeffq*2+ 0], m5
|
||||||
|
mova [dqcoeffq+ncoeffq*2+16], m5
|
||||||
|
add ncoeffq, mmsize
|
||||||
|
jl .ac_only_loop
|
||||||
|
%endif
|
||||||
|
|
||||||
.accumulate_eob:
|
.accumulate_eob:
|
||||||
; horizontally accumulate/max eobs and write into [eob] memory pointer
|
; horizontally accumulate/max eobs and write into [eob] memory pointer
|
||||||
|
mov r2, eobmp
|
||||||
pshufd m7, m8, 0xe
|
pshufd m7, m8, 0xe
|
||||||
pmaxsw m8, m7
|
pmaxsw m8, m7
|
||||||
pshuflw m7, m8, 0xe
|
pshuflw m7, m8, 0xe
|
||||||
pmaxsw m8, m7
|
pmaxsw m8, m7
|
||||||
pshuflw m7, m8, 0x1
|
pshuflw m7, m8, 0x1
|
||||||
pmaxsw m8, m7
|
pmaxsw m8, m7
|
||||||
pextrw [eobq], m8, 0
|
pextrw [r2], m8, 0
|
||||||
RET
|
RET
|
||||||
|
|
||||||
; skip-block, i.e. just write all zeroes
|
; skip-block, i.e. just write all zeroes
|
||||||
@ -194,5 +210,5 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_XMM ssse3
|
INIT_XMM ssse3
|
||||||
QUANTIZE_FN b
|
QUANTIZE_FN b, 6
|
||||||
QUANTIZE_FN b_32x32
|
QUANTIZE_FN b_32x32, 7
|
||||||
|
Loading…
Reference in New Issue
Block a user