58e0159c80
In ssse3 functions, DEFINE_ARGS macro hard codes qcoeff and dqcoeff to r3 and r4. If skip is 1, qcoeff and dqcoeff need to be loaded from the stack, which doesn't work because of the above definitions. Currently, skip=1 case is not used in the encoder. This patch fixed the issue, so it can be turned on later. Change-Id: I998d696b1a7a85dca2b3bcee790b21c21e039147
400 lines
18 KiB
NASM
400 lines
18 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
%include "third_party/x86inc/x86inc.asm"
|
|
|
|
SECTION_RODATA
|
|
pw_1: times 8 dw 1
|
|
|
|
SECTION .text
|
|
|
|
; TODO(yunqingwang)fix quantize_b code for skip=1 case.
|
|
%macro QUANTIZE_FN 2
|
|
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
|
shift, qcoeff, dqcoeff, dequant, \
|
|
eob, scan, iscan
|
|
cmp dword skipm, 0
|
|
jne .blank
|
|
|
|
; actual quantize loop - setup pointers, rounders, etc.
|
|
movifnidn coeffq, coeffmp
|
|
movifnidn ncoeffq, ncoeffmp
|
|
mov r2, dequantmp
|
|
movifnidn zbinq, zbinmp
|
|
movifnidn roundq, roundmp
|
|
movifnidn quantq, quantmp
|
|
mova m0, [zbinq] ; m0 = zbin
|
|
mova m1, [roundq] ; m1 = round
|
|
mova m2, [quantq] ; m2 = quant
|
|
%ifidn %1, b_32x32
|
|
pcmpeqw m5, m5
|
|
psrlw m5, 15
|
|
paddw m0, m5
|
|
paddw m1, m5
|
|
psrlw m0, 1 ; m0 = (m0 + 1) / 2
|
|
psrlw m1, 1 ; m1 = (m1 + 1) / 2
|
|
%endif
|
|
mova m3, [r2q] ; m3 = dequant
|
|
psubw m0, [pw_1]
|
|
mov r2, shiftmp
|
|
mov r3, qcoeffmp
|
|
mova m4, [r2] ; m4 = shift
|
|
mov r4, dqcoeffmp
|
|
mov r5, iscanmp
|
|
%ifidn %1, b_32x32
|
|
psllw m4, 1
|
|
%endif
|
|
pxor m5, m5 ; m5 = dedicated zero
|
|
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
|
|
lea coeffq, [ coeffq+ncoeffq*2]
|
|
lea iscanq, [ iscanq+ncoeffq*2]
|
|
lea qcoeffq, [ qcoeffq+ncoeffq*2]
|
|
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
|
|
neg ncoeffq
|
|
|
|
; get DC and first 15 AC coeffs
|
|
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
|
|
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
|
|
pabsw m6, m9 ; m6 = abs(m9)
|
|
pabsw m11, m10 ; m11 = abs(m10)
|
|
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
|
|
punpckhqdq m0, m0
|
|
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
|
|
paddsw m6, m1 ; m6 += round
|
|
punpckhqdq m1, m1
|
|
paddsw m11, m1 ; m11 += round
|
|
pmulhw m8, m6, m2 ; m8 = m6*q>>16
|
|
punpckhqdq m2, m2
|
|
pmulhw m13, m11, m2 ; m13 = m11*q>>16
|
|
paddw m8, m6 ; m8 += m6
|
|
paddw m13, m11 ; m13 += m11
|
|
pmulhw m8, m4 ; m8 = m8*qsh>>16
|
|
punpckhqdq m4, m4
|
|
pmulhw m13, m4 ; m13 = m13*qsh>>16
|
|
psignw m8, m9 ; m8 = reinsert sign
|
|
psignw m13, m10 ; m13 = reinsert sign
|
|
pand m8, m7
|
|
pand m13, m12
|
|
mova [qcoeffq+ncoeffq*2+ 0], m8
|
|
mova [qcoeffq+ncoeffq*2+16], m13
|
|
%ifidn %1, b_32x32
|
|
pabsw m8, m8
|
|
pabsw m13, m13
|
|
%endif
|
|
pmullw m8, m3 ; dqc[i] = qc[i] * q
|
|
punpckhqdq m3, m3
|
|
pmullw m13, m3 ; dqc[i] = qc[i] * q
|
|
%ifidn %1, b_32x32
|
|
psrlw m8, 1
|
|
psrlw m13, 1
|
|
psignw m8, m9
|
|
psignw m13, m10
|
|
%endif
|
|
mova [dqcoeffq+ncoeffq*2+ 0], m8
|
|
mova [dqcoeffq+ncoeffq*2+16], m13
|
|
pcmpeqw m8, m5 ; m8 = c[i] == 0
|
|
pcmpeqw m13, m5 ; m13 = c[i] == 0
|
|
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
|
|
mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
|
|
psubw m6, m7 ; m6 = scan[i] + 1
|
|
psubw m11, m12 ; m11 = scan[i] + 1
|
|
pandn m8, m6 ; m8 = max(eob)
|
|
pandn m13, m11 ; m13 = max(eob)
|
|
pmaxsw m8, m13
|
|
add ncoeffq, mmsize
|
|
jz .accumulate_eob
|
|
|
|
.ac_only_loop:
|
|
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
|
|
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
|
|
pabsw m6, m9 ; m6 = abs(m9)
|
|
pabsw m11, m10 ; m11 = abs(m10)
|
|
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
|
|
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
|
|
%ifidn %1, b_32x32
|
|
pmovmskb r6d, m7
|
|
pmovmskb r2d, m12
|
|
or r6, r2
|
|
jz .skip_iter
|
|
%endif
|
|
paddsw m6, m1 ; m6 += round
|
|
paddsw m11, m1 ; m11 += round
|
|
pmulhw m14, m6, m2 ; m14 = m6*q>>16
|
|
pmulhw m13, m11, m2 ; m13 = m11*q>>16
|
|
paddw m14, m6 ; m14 += m6
|
|
paddw m13, m11 ; m13 += m11
|
|
pmulhw m14, m4 ; m14 = m14*qsh>>16
|
|
pmulhw m13, m4 ; m13 = m13*qsh>>16
|
|
psignw m14, m9 ; m14 = reinsert sign
|
|
psignw m13, m10 ; m13 = reinsert sign
|
|
pand m14, m7
|
|
pand m13, m12
|
|
mova [qcoeffq+ncoeffq*2+ 0], m14
|
|
mova [qcoeffq+ncoeffq*2+16], m13
|
|
%ifidn %1, b_32x32
|
|
pabsw m14, m14
|
|
pabsw m13, m13
|
|
%endif
|
|
pmullw m14, m3 ; dqc[i] = qc[i] * q
|
|
pmullw m13, m3 ; dqc[i] = qc[i] * q
|
|
%ifidn %1, b_32x32
|
|
psrlw m14, 1
|
|
psrlw m13, 1
|
|
psignw m14, m9
|
|
psignw m13, m10
|
|
%endif
|
|
mova [dqcoeffq+ncoeffq*2+ 0], m14
|
|
mova [dqcoeffq+ncoeffq*2+16], m13
|
|
pcmpeqw m14, m5 ; m14 = c[i] == 0
|
|
pcmpeqw m13, m5 ; m13 = c[i] == 0
|
|
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
|
|
mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
|
|
psubw m6, m7 ; m6 = scan[i] + 1
|
|
psubw m11, m12 ; m11 = scan[i] + 1
|
|
pandn m14, m6 ; m14 = max(eob)
|
|
pandn m13, m11 ; m13 = max(eob)
|
|
pmaxsw m8, m14
|
|
pmaxsw m8, m13
|
|
add ncoeffq, mmsize
|
|
jl .ac_only_loop
|
|
|
|
%ifidn %1, b_32x32
|
|
jmp .accumulate_eob
|
|
.skip_iter:
|
|
mova [qcoeffq+ncoeffq*2+ 0], m5
|
|
mova [qcoeffq+ncoeffq*2+16], m5
|
|
mova [dqcoeffq+ncoeffq*2+ 0], m5
|
|
mova [dqcoeffq+ncoeffq*2+16], m5
|
|
add ncoeffq, mmsize
|
|
jl .ac_only_loop
|
|
%endif
|
|
|
|
.accumulate_eob:
|
|
; horizontally accumulate/max eobs and write into [eob] memory pointer
|
|
mov r2, eobmp
|
|
pshufd m7, m8, 0xe
|
|
pmaxsw m8, m7
|
|
pshuflw m7, m8, 0xe
|
|
pmaxsw m8, m7
|
|
pshuflw m7, m8, 0x1
|
|
pmaxsw m8, m7
|
|
pextrw r6, m8, 0
|
|
mov [r2], r6
|
|
RET
|
|
|
|
; skip-block, i.e. just write all zeroes
|
|
.blank:
|
|
mov r0, dqcoeffmp
|
|
movifnidn ncoeffq, ncoeffmp
|
|
mov r2, qcoeffmp
|
|
mov r3, eobmp
|
|
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
|
|
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
|
|
lea qcoeffq, [ qcoeffq+ncoeffq*2]
|
|
neg ncoeffq
|
|
pxor m7, m7
|
|
.blank_loop:
|
|
mova [dqcoeffq+ncoeffq*2+ 0], m7
|
|
mova [dqcoeffq+ncoeffq*2+16], m7
|
|
mova [qcoeffq+ncoeffq*2+ 0], m7
|
|
mova [qcoeffq+ncoeffq*2+16], m7
|
|
add ncoeffq, mmsize
|
|
jl .blank_loop
|
|
mov word [eobq], 0
|
|
RET
|
|
%endmacro
|
|
|
|
INIT_XMM ssse3
|
|
QUANTIZE_FN b, 7
|
|
QUANTIZE_FN b_32x32, 7
|
|
|
|
%macro QUANTIZE_FP 2
|
|
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
|
shift, qcoeff, dqcoeff, dequant, \
|
|
eob, scan, iscan
|
|
cmp dword skipm, 0
|
|
jne .blank
|
|
|
|
; actual quantize loop - setup pointers, rounders, etc.
|
|
movifnidn coeffq, coeffmp
|
|
movifnidn ncoeffq, ncoeffmp
|
|
mov r2, dequantmp
|
|
movifnidn zbinq, zbinmp
|
|
movifnidn roundq, roundmp
|
|
movifnidn quantq, quantmp
|
|
mova m1, [roundq] ; m1 = round
|
|
mova m2, [quantq] ; m2 = quant
|
|
%ifidn %1, fp_32x32
|
|
pcmpeqw m5, m5
|
|
psrlw m5, 15
|
|
paddw m1, m5
|
|
psrlw m1, 1 ; m1 = (m1 + 1) / 2
|
|
%endif
|
|
mova m3, [r2q] ; m3 = dequant
|
|
mov r3, qcoeffmp
|
|
mov r4, dqcoeffmp
|
|
mov r5, iscanmp
|
|
%ifidn %1, fp_32x32
|
|
psllw m2, 1
|
|
%endif
|
|
pxor m5, m5 ; m5 = dedicated zero
|
|
|
|
lea coeffq, [ coeffq+ncoeffq*2]
|
|
lea r5q, [ r5q+ncoeffq*2]
|
|
lea r3q, [ r3q+ncoeffq*2]
|
|
lea r4q, [r4q+ncoeffq*2]
|
|
neg ncoeffq
|
|
|
|
; get DC and first 15 AC coeffs
|
|
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
|
|
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
|
|
pabsw m6, m9 ; m6 = abs(m9)
|
|
pabsw m11, m10 ; m11 = abs(m10)
|
|
pcmpeqw m7, m7
|
|
|
|
paddsw m6, m1 ; m6 += round
|
|
punpckhqdq m1, m1
|
|
paddsw m11, m1 ; m11 += round
|
|
pmulhw m8, m6, m2 ; m8 = m6*q>>16
|
|
punpckhqdq m2, m2
|
|
pmulhw m13, m11, m2 ; m13 = m11*q>>16
|
|
psignw m8, m9 ; m8 = reinsert sign
|
|
psignw m13, m10 ; m13 = reinsert sign
|
|
mova [r3q+ncoeffq*2+ 0], m8
|
|
mova [r3q+ncoeffq*2+16], m13
|
|
%ifidn %1, fp_32x32
|
|
pabsw m8, m8
|
|
pabsw m13, m13
|
|
%endif
|
|
pmullw m8, m3 ; r4[i] = r3[i] * q
|
|
punpckhqdq m3, m3
|
|
pmullw m13, m3 ; r4[i] = r3[i] * q
|
|
%ifidn %1, fp_32x32
|
|
psrlw m8, 1
|
|
psrlw m13, 1
|
|
psignw m8, m9
|
|
psignw m13, m10
|
|
psrlw m0, m3, 2
|
|
%endif
|
|
mova [r4q+ncoeffq*2+ 0], m8
|
|
mova [r4q+ncoeffq*2+16], m13
|
|
pcmpeqw m8, m5 ; m8 = c[i] == 0
|
|
pcmpeqw m13, m5 ; m13 = c[i] == 0
|
|
mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
|
|
mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
|
|
psubw m6, m7 ; m6 = scan[i] + 1
|
|
psubw m11, m7 ; m11 = scan[i] + 1
|
|
pandn m8, m6 ; m8 = max(eob)
|
|
pandn m13, m11 ; m13 = max(eob)
|
|
pmaxsw m8, m13
|
|
add ncoeffq, mmsize
|
|
jz .accumulate_eob
|
|
|
|
.ac_only_loop:
|
|
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
|
|
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
|
|
pabsw m6, m9 ; m6 = abs(m9)
|
|
pabsw m11, m10 ; m11 = abs(m10)
|
|
%ifidn %1, fp_32x32
|
|
pcmpgtw m7, m6, m0
|
|
pcmpgtw m12, m11, m0
|
|
pmovmskb r6d, m7
|
|
pmovmskb r2d, m12
|
|
|
|
or r6, r2
|
|
jz .skip_iter
|
|
%endif
|
|
pcmpeqw m7, m7
|
|
|
|
paddsw m6, m1 ; m6 += round
|
|
paddsw m11, m1 ; m11 += round
|
|
pmulhw m14, m6, m2 ; m14 = m6*q>>16
|
|
pmulhw m13, m11, m2 ; m13 = m11*q>>16
|
|
psignw m14, m9 ; m14 = reinsert sign
|
|
psignw m13, m10 ; m13 = reinsert sign
|
|
mova [r3q+ncoeffq*2+ 0], m14
|
|
mova [r3q+ncoeffq*2+16], m13
|
|
%ifidn %1, fp_32x32
|
|
pabsw m14, m14
|
|
pabsw m13, m13
|
|
%endif
|
|
pmullw m14, m3 ; r4[i] = r3[i] * q
|
|
pmullw m13, m3 ; r4[i] = r3[i] * q
|
|
%ifidn %1, fp_32x32
|
|
psrlw m14, 1
|
|
psrlw m13, 1
|
|
psignw m14, m9
|
|
psignw m13, m10
|
|
%endif
|
|
mova [r4q+ncoeffq*2+ 0], m14
|
|
mova [r4q+ncoeffq*2+16], m13
|
|
pcmpeqw m14, m5 ; m14 = c[i] == 0
|
|
pcmpeqw m13, m5 ; m13 = c[i] == 0
|
|
mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
|
|
mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
|
|
psubw m6, m7 ; m6 = scan[i] + 1
|
|
psubw m11, m7 ; m11 = scan[i] + 1
|
|
pandn m14, m6 ; m14 = max(eob)
|
|
pandn m13, m11 ; m13 = max(eob)
|
|
pmaxsw m8, m14
|
|
pmaxsw m8, m13
|
|
add ncoeffq, mmsize
|
|
jl .ac_only_loop
|
|
|
|
%ifidn %1, fp_32x32
|
|
jmp .accumulate_eob
|
|
.skip_iter:
|
|
mova [r3q+ncoeffq*2+ 0], m5
|
|
mova [r3q+ncoeffq*2+16], m5
|
|
mova [r4q+ncoeffq*2+ 0], m5
|
|
mova [r4q+ncoeffq*2+16], m5
|
|
add ncoeffq, mmsize
|
|
jl .ac_only_loop
|
|
%endif
|
|
|
|
.accumulate_eob:
|
|
; horizontally accumulate/max eobs and write into [eob] memory pointer
|
|
mov r2, eobmp
|
|
pshufd m7, m8, 0xe
|
|
pmaxsw m8, m7
|
|
pshuflw m7, m8, 0xe
|
|
pmaxsw m8, m7
|
|
pshuflw m7, m8, 0x1
|
|
pmaxsw m8, m7
|
|
pextrw r6, m8, 0
|
|
mov [r2], r6
|
|
RET
|
|
|
|
; skip-block, i.e. just write all zeroes
|
|
.blank:
|
|
mov r0, dqcoeffmp
|
|
movifnidn ncoeffq, ncoeffmp
|
|
mov r2, qcoeffmp
|
|
mov r3, eobmp
|
|
|
|
lea r0q, [r0q+ncoeffq*2]
|
|
lea r2q, [r2q+ncoeffq*2]
|
|
neg ncoeffq
|
|
pxor m7, m7
|
|
.blank_loop:
|
|
mova [r0q+ncoeffq*2+ 0], m7
|
|
mova [r0q+ncoeffq*2+16], m7
|
|
mova [r2q+ncoeffq*2+ 0], m7
|
|
mova [r2q+ncoeffq*2+16], m7
|
|
add ncoeffq, mmsize
|
|
jl .blank_loop
|
|
mov word [r3q], 0
|
|
RET
|
|
%endmacro
|
|
|
|
INIT_XMM ssse3
|
|
QUANTIZE_FP fp, 7
|
|
QUANTIZE_FP fp_32x32, 7
|