SSSE3 assembly implementation of 8x8 Hadamard transform
It uses about 10% fewer CPU cycles than the SSE2 intrinsic implementation. Change-Id: I91017c0c068679a214b98cdd4cff3a6facfb7499
This commit is contained in:
parent
9922e4344a
commit
7f629dfca4
@ -1110,7 +1110,7 @@ add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
|
||||
specialize qw/vp9_avg_4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
|
||||
specialize qw/vp9_hadamard_8x8 sse2/;
|
||||
specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
|
||||
|
||||
add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
|
||||
specialize qw/vp9_hadamard_16x16 sse2/;
|
||||
|
@ -178,5 +178,78 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
|
||||
mova [outputq + 96], m6
|
||||
mova [outputq + 112], m7
|
||||
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------
; HMD8_BUTTERFLY_PAIR a, b, c, d
; Two add/subtract butterflies on packed 16-bit words:
;   (m<a>, m<b>) <- (m<a> + m<b>, m<a> - m<b>)
;   (m<c>, m<d>) <- (m<c> + m<d>, m<c> - m<d>)
; The differences are formed in m8/m9 and then renamed into place with
; SWAP, so no extra register copy is issued for them.
; Clobbers: m8, m9 (they hold stale data afterwards).
;-----------------------------------------------------------------------
%macro HMD8_BUTTERFLY_PAIR 4
  psubw              m8, m%1, m%2
  psubw              m9, m%3, m%4
  paddw              m%1, m%2
  paddw              m%3, m%4
  SWAP               %2, 8
  SWAP               %4, 9
%endmacro

;-----------------------------------------------------------------------
; HMD8_1D
; One-dimensional 8-point add/subtract network applied across m0..m7
; (register k is combined with register k+1, then k+2, then k+4).
; In:    m0..m7
; Out:   m0..m7, updated in place
; Clobb: m8, m9
;-----------------------------------------------------------------------
%macro HMD8_1D 0
  ; stage 1: partner distance 1
  HMD8_BUTTERFLY_PAIR 0, 1, 2, 3
  HMD8_BUTTERFLY_PAIR 4, 5, 6, 7

  ; stage 2: partner distance 2
  HMD8_BUTTERFLY_PAIR 0, 2, 1, 3
  HMD8_BUTTERFLY_PAIR 4, 6, 5, 7

  ; stage 3: partner distance 4
  HMD8_BUTTERFLY_PAIR 0, 4, 1, 5
  HMD8_BUTTERFLY_PAIR 2, 6, 3, 7
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------
; hadamard_8x8(input, stride, output)
; Presumably the SSSE3 body behind
;   void vp9_hadamard_8x8(int16_t const *src_diff, int src_stride,
;                         int16_t *coeff)
; (symbol prefixing is done by cglobal - confirm against the rtcd setup).
;
; ABI:   x86-64 only (uses m8/m9); register mapping handled by x86inc.
; In:    inputq  = first of 8 rows, each read as one 16-byte vector
;        strided = row pitch as a 32-bit signed int; it is doubled to
;                  get the byte pitch, i.e. presumably counted in
;                  int16_t elements - confirm against callers
;        outputq = destination for 8 x 16 bytes (128 bytes total)
; Out:   [outputq + 0 .. outputq + 127]
; Clobb: r3, r4, inputq, m0-m9
; Align: every load and store uses mova, so each input row and the
;        output buffer are expected to be 16-byte aligned.
;-----------------------------------------------------------------------
INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 10, input, stride, output
  ; The C prototype passes the stride as a 32-bit int; the upper half of
  ; the argument register is not guaranteed by the ABI, so widen it
  ; before it is used in 64-bit address arithmetic.
  movsxdifnidn       strideq, strided
  lea                r3, [2 * strideq]          ; r3 = byte offset of the next row
  lea                r4, [4 * strideq]          ; r4 = byte offset two rows ahead

  ; Load the 8 rows, two per step, advancing inputq by two rows each time.
  mova               m0, [inputq]
  mova               m1, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m2, [inputq]
  mova               m3, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m4, [inputq]
  mova               m5, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m6, [inputq]
  mova               m7, [inputq + r3]

  ; First 1-D pass, transpose (m9 is the scratch register), second 1-D pass.
  HMD8_1D
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  HMD8_1D

  ; Store m0..m7 as eight consecutive 16-byte rows.
  mova              [outputq +   0], m0
  mova              [outputq +  16], m1
  mova              [outputq +  32], m2
  mova              [outputq +  48], m3
  mova              [outputq +  64], m4
  mova              [outputq +  80], m5
  mova              [outputq +  96], m6
  mova              [outputq + 112], m7

  RET
|
||||
%endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user