Merge "block error sse2: use tran_low_t"

This commit is contained in:
Johann Koenig
2017-02-24 05:24:34 +00:00
committed by Gerrit Code Review
2 changed files with 9 additions and 13 deletions

View File

@@ -125,7 +125,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/; specialize qw/vp9_block_error avx2 sse2/;
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error sse2/; specialize qw/vp9_highbd_block_error sse2/;

View File

@@ -15,8 +15,6 @@
SECTION .text SECTION .text
%if CONFIG_VP9_HIGHBITDEPTH
%else
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, ; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
; int64_t *ssz) ; int64_t *ssz)
@@ -25,14 +23,14 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
pxor m4, m4 ; sse accumulator pxor m4, m4 ; sse accumulator
pxor m6, m6 ; ssz accumulator pxor m6, m6 ; ssz accumulator
pxor m5, m5 ; dedicated zero register pxor m5, m5 ; dedicated zero register
lea uqcq, [uqcq+sizeq*2]
lea dqcq, [dqcq+sizeq*2]
neg sizeq
.loop: .loop:
mova m2, [uqcq+sizeq*2] LOAD_TRAN_LOW 2, uqcq, 0
mova m0, [dqcq+sizeq*2] LOAD_TRAN_LOW 0, dqcq, 0
mova m3, [uqcq+sizeq*2+mmsize] LOAD_TRAN_LOW 3, uqcq, 8
mova m1, [dqcq+sizeq*2+mmsize] LOAD_TRAN_LOW 1, dqcq, 8
INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
sub sizeq, 16
psubw m0, m2 psubw m0, m2
psubw m1, m3 psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and ; individual errors are max. 15bit+sign, so squares are 30bit, and
@@ -58,8 +56,7 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
punpckhdq m3, m5 punpckhdq m3, m5
paddq m6, m7 paddq m6, m7
paddq m6, m3 paddq m6, m3
add sizeq, mmsize jg .loop
jl .loop
; accumulate horizontally and store in return value ; accumulate horizontally and store in return value
movhlps m5, m4 movhlps m5, m4
@@ -77,7 +74,6 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
movd edx, m5 movd edx, m5
%endif %endif
RET RET
%endif ; CONFIG_VP9_HIGHBITDEPTH
; Compute the sum of squared difference between two tran_low_t vectors. ; Compute the sum of squared difference between two tran_low_t vectors.
; Vectors are converted (if necessary) to int16_t for calculations. ; Vectors are converted (if necessary) to int16_t for calculations.