aa8f85223b
A new version of vp9_highbd_error_8bit is now available which is optimized with AVX assembly. AVX itself does not buy us too much, but the non-destructive 3 operand format encoding of the 128bit SSEn integer instructions helps to eliminate move instructions. The Sandy Bridge micro-architecture cannot eliminate move instructions in the processor front end, so AVX will help on these machines. Further 2 optimizations are applied: 1. The common case of computing block error on 4x4 blocks is optimized as a special case. 2. All arithmetic is speculatively done on 32 bits only. At the end of the loop, the code detects if overflow might have happened and if so, the whole computation is re-executed using higher precision arithmetic. This case however is extremely rare in real use, so we can achieve a large net gain here. The optimizations rely on the fact that the coefficients are in the range [-(2^15-1), 2^15-1], and that the quantized coefficients always have the same sign as the input coefficients (in the worst case they are 0). These are the same assumptions that the old SSE2 assembly code for the non high bitdepth configuration relied on. The unit tests have been updated to take this constraint into consideration when generating test input data. Change-Id: I57d9888a74715e7145a5d9987d67891ef68f39b7
262 lines
7.0 KiB
NASM
262 lines
7.0 KiB
NASM
;
; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
|
|
%define private_prefix vp9
|
|
|
|
%include "third_party/x86inc/x86inc.asm"
|
|
|
|
SECTION .text
|
|
ALIGN 16
|
|
|
|
;
|
|
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
|
|
; intptr_t block_size, int64_t *ssz)
|
|
;
|
|
|
|
INIT_XMM avx
|
|
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
|
|
vzeroupper
|
|
|
|
; If only one iteration is required, then handle this as a special case.
|
|
; It is the most frequent case, so we can have a significant gain here
|
|
; by not setting up a loop and accumulators.
|
|
cmp sizeq, 16
|
|
jne .generic
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Common case of size == 16
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
; Load input vectors
|
|
mova xm0, [dqcq]
|
|
packssdw xm0, [dqcq+16]
|
|
mova xm2, [uqcq]
|
|
packssdw xm2, [uqcq+16]
|
|
|
|
mova xm1, [dqcq+32]
|
|
packssdw xm1, [dqcq+48]
|
|
mova xm3, [uqcq+32]
|
|
packssdw xm3, [uqcq+48]
|
|
|
|
; Compute the errors.
|
|
psubw xm0, xm2
|
|
psubw xm1, xm3
|
|
|
|
; Individual errors are max 15bit+sign, so squares are 30bit, and
|
|
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
|
|
pmaddwd xm2, xm2
|
|
pmaddwd xm3, xm3
|
|
|
|
pmaddwd xm0, xm0
|
|
pmaddwd xm1, xm1
|
|
|
|
; Squares are always positive, so we can use unsigned arithmetic after
|
|
; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
|
|
; fit in 32bits
|
|
paddd xm2, xm3
|
|
paddd xm0, xm1
|
|
|
|
; Accumulate horizontally in 64 bits, there is no chance of overflow here
|
|
pxor xm5, xm5
|
|
|
|
pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
|
|
psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
|
|
|
|
pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
|
|
psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
|
|
|
|
paddq xm2, xm3
|
|
paddq xm0, xm1
|
|
|
|
psrldq xm3, xm2, 8
|
|
psrldq xm1, xm0, 8
|
|
|
|
paddq xm2, xm3
|
|
paddq xm0, xm1
|
|
|
|
; Store the return value
|
|
%if ARCH_X86_64
|
|
movq rax, xm0
|
|
movq [sszq], xm2
|
|
%else
|
|
movd eax, xm0
|
|
pextrd edx, xm0, 1
|
|
movq [sszd], xm2
|
|
%endif
|
|
RET
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Generic case of size != 16, speculative low precision
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
ALIGN 16
|
|
.generic:
|
|
pxor xm4, xm4 ; sse accumulator
|
|
pxor xm5, xm5 ; overflow detection register for xm4
|
|
pxor xm6, xm6 ; ssz accumulator
|
|
pxor xm7, xm7 ; overflow detection register for xm6
|
|
lea uqcq, [uqcq+sizeq*4]
|
|
lea dqcq, [dqcq+sizeq*4]
|
|
neg sizeq
|
|
|
|
; Push the negative size as the high precision code might need it
|
|
push sizeq
|
|
|
|
.loop:
|
|
; Load input vectors
|
|
mova xm0, [dqcq+sizeq*4]
|
|
packssdw xm0, [dqcq+sizeq*4+16]
|
|
mova xm2, [uqcq+sizeq*4]
|
|
packssdw xm2, [uqcq+sizeq*4+16]
|
|
|
|
mova xm1, [dqcq+sizeq*4+32]
|
|
packssdw xm1, [dqcq+sizeq*4+48]
|
|
mova xm3, [uqcq+sizeq*4+32]
|
|
packssdw xm3, [uqcq+sizeq*4+48]
|
|
|
|
add sizeq, 16
|
|
|
|
; Compute the squared errors.
|
|
; Individual errors are max 15bit+sign, so squares are 30bit, and
|
|
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
|
|
psubw xm0, xm2
|
|
pmaddwd xm2, xm2
|
|
pmaddwd xm0, xm0
|
|
|
|
psubw xm1, xm3
|
|
pmaddwd xm3, xm3
|
|
pmaddwd xm1, xm1
|
|
|
|
; Squares are always positive, so we can use unsigned arithmetic after
|
|
; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
|
|
; fit in 32bits
|
|
paddd xm2, xm3
|
|
paddd xm0, xm1
|
|
|
|
; We accumulate using 32 bit arithmetic, but detect potential overflow
|
|
; by checking if the MSB of the accumulators have ever been a set bit.
|
|
; If yes, we redo the whole compute at the end on higher precision, but
|
|
; this happens extremely rarely, so we still achieve a net gain.
|
|
paddd xm4, xm0
|
|
paddd xm6, xm2
|
|
por xm5, xm4 ; OR in the accumulator for overflow detection
|
|
por xm7, xm6 ; OR in the accumulator for overflow detection
|
|
|
|
jnz .loop
|
|
|
|
; Add pairs horizontally (still only on 32 bits)
|
|
phaddd xm4, xm4
|
|
por xm5, xm4 ; OR in the accumulator for overflow detection
|
|
phaddd xm6, xm6
|
|
por xm7, xm6 ; OR in the accumulator for overflow detection
|
|
|
|
; Check for possibility of overflow by testing if bit 32 of each dword lane
|
|
; have ever been set. If they were not, then there was no overflow and the
|
|
; final sum will fit in 32 bits. If overflow happened, then
|
|
; we redo the whole computation on higher precision.
|
|
por xm7, xm5
|
|
pmovmskb r4, xm7
|
|
test r4, 0x8888
|
|
jnz .highprec
|
|
|
|
phaddd xm4, xm4
|
|
phaddd xm6, xm6
|
|
pmovzxdq xm4, xm4
|
|
pmovzxdq xm6, xm6
|
|
|
|
; Restore stack
|
|
pop sizeq
|
|
|
|
; Store the return value
|
|
%if ARCH_X86_64
|
|
movq rax, xm4
|
|
movq [sszq], xm6
|
|
%else
|
|
movd eax, xm4
|
|
pextrd edx, xm4, 1
|
|
movq [sszd], xm6
|
|
%endif
|
|
RET
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Generic case of size != 16, high precision case
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
.highprec:
|
|
pxor xm4, xm4 ; sse accumulator
|
|
pxor xm5, xm5 ; dedicated zero register
|
|
pxor xm6, xm6 ; ssz accumulator
|
|
pop sizeq
|
|
|
|
.loophp:
|
|
mova xm0, [dqcq+sizeq*4]
|
|
packssdw xm0, [dqcq+sizeq*4+16]
|
|
mova xm2, [uqcq+sizeq*4]
|
|
packssdw xm2, [uqcq+sizeq*4+16]
|
|
|
|
mova xm1, [dqcq+sizeq*4+32]
|
|
packssdw xm1, [dqcq+sizeq*4+48]
|
|
mova xm3, [uqcq+sizeq*4+32]
|
|
packssdw xm3, [uqcq+sizeq*4+48]
|
|
|
|
add sizeq, 16
|
|
|
|
; individual errors are max. 15bit+sign, so squares are 30bit, and
|
|
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
|
|
|
|
psubw xm0, xm2
|
|
pmaddwd xm2, xm2
|
|
pmaddwd xm0, xm0
|
|
|
|
psubw xm1, xm3
|
|
pmaddwd xm3, xm3
|
|
pmaddwd xm1, xm1
|
|
|
|
; accumulate in 64bit
|
|
punpckldq xm7, xm0, xm5
|
|
punpckhdq xm0, xm5
|
|
paddq xm4, xm7
|
|
|
|
punpckldq xm7, xm2, xm5
|
|
punpckhdq xm2, xm5
|
|
paddq xm6, xm7
|
|
|
|
punpckldq xm7, xm1, xm5
|
|
punpckhdq xm1, xm5
|
|
paddq xm4, xm7
|
|
|
|
punpckldq xm7, xm3, xm5
|
|
punpckhdq xm3, xm5
|
|
paddq xm6, xm7
|
|
|
|
paddq xm4, xm0
|
|
paddq xm4, xm1
|
|
paddq xm6, xm2
|
|
paddq xm6, xm3
|
|
|
|
jnz .loophp
|
|
|
|
; Accumulate horizontally
|
|
movhlps xm5, xm4
|
|
movhlps xm7, xm6
|
|
paddq xm4, xm5
|
|
paddq xm6, xm7
|
|
|
|
; Store the return value
|
|
%if ARCH_X86_64
|
|
movq rax, xm4
|
|
movq [sszq], xm6
|
|
%else
|
|
movd eax, xm4
|
|
pextrd edx, xm4, 1
|
|
movq [sszd], xm6
|
|
%endif
|
|
RET
|
|
|
|
END
|