SSSE3 8x8 inverse 2D-DCT with first 10 coeffs non-zero
This commit enables the SSSE3 assembly implementation of the 8x8 inverse 2D-DCT for the case where only the first 10 coefficients are non-zero. The average runtime of this unit drops from 198 cycles to 129 cycles (34.8% faster).
Change-Id: Ie7fa4386f6d3a2fe0d47a2eb26fc2a6bbc592ac7
This commit is contained in:
parent
928ff03889
commit
9e7b09bc5d
@ -313,7 +313,7 @@ add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int
|
||||
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
|
||||
|
||||
add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
||||
specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/;
|
||||
specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/, "$ssse3_x86_64";
|
||||
|
||||
add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
||||
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
|
||||
|
@ -28,6 +28,29 @@ TRANSFORM_COEFFS 6270, 15137
|
||||
TRANSFORM_COEFFS 3196, 16069
|
||||
TRANSFORM_COEFFS 13623, 9102
|
||||
|
||||
%macro PAIR_PP_COEFFS 2
|
||||
dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
|
||||
%endmacro
|
||||
|
||||
%macro PAIR_MP_COEFFS 2
|
||||
dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
|
||||
%endmacro
|
||||
|
||||
%macro PAIR_MM_COEFFS 2
|
||||
dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
|
||||
%endmacro
|
||||
|
||||
PAIR_PP_COEFFS 30274, 12540
|
||||
PAIR_PP_COEFFS 6392, 32138
|
||||
PAIR_MP_COEFFS 18204, 27246
|
||||
|
||||
PAIR_PP_COEFFS 12540, 12540
|
||||
PAIR_PP_COEFFS 30274, 30274
|
||||
PAIR_PP_COEFFS 6392, 6392
|
||||
PAIR_PP_COEFFS 32138, 32138
|
||||
PAIR_MM_COEFFS 18204, 18204
|
||||
PAIR_PP_COEFFS 27246, 27246
|
||||
|
||||
SECTION .text
|
||||
|
||||
%if ARCH_X86_64
|
||||
@ -128,6 +151,7 @@ SECTION .text
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
; full inverse 8x8 2D-DCT transform
|
||||
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
|
||||
mova m8, [pd_8192]
|
||||
mova m11, [pw_16]
|
||||
@ -159,4 +183,118 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
|
||||
ADD_STORE_8P_2X 6, 7, 9, 10, 12
|
||||
|
||||
RET
|
||||
|
||||
; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
|
||||
cglobal idct8x8_10_add, 3, 5, 13, input, output, stride
|
||||
mova m8, [pd_8192]
|
||||
mova m11, [pw_16]
|
||||
mova m12, [pw_11585x2]
|
||||
|
||||
lea r3, [2 * strideq]
|
||||
|
||||
mova m0, [inputq + 0]
|
||||
mova m1, [inputq + 16]
|
||||
mova m2, [inputq + 32]
|
||||
mova m3, [inputq + 48]
|
||||
|
||||
punpcklwd m0, m1
|
||||
punpcklwd m2, m3
|
||||
punpckhdq m9, m0, m2
|
||||
punpckldq m0, m2
|
||||
SWAP 2, 9
|
||||
|
||||
; m0 -> [0], [0]
|
||||
; m1 -> [1], [1]
|
||||
; m2 -> [2], [2]
|
||||
; m3 -> [3], [3]
|
||||
punpckhqdq m10, m0, m0
|
||||
punpcklqdq m0, m0
|
||||
punpckhqdq m9, m2, m2
|
||||
punpcklqdq m2, m2
|
||||
SWAP 1, 10
|
||||
SWAP 3, 9
|
||||
|
||||
pmulhrsw m0, m12
|
||||
pmulhrsw m2, [dpw_30274_12540]
|
||||
pmulhrsw m1, [dpw_6392_32138]
|
||||
pmulhrsw m3, [dpw_m18204_27246]
|
||||
|
||||
SUM_SUB 0, 2, 9
|
||||
SUM_SUB 1, 3, 9
|
||||
|
||||
punpcklqdq m9, m3, m3
|
||||
punpckhqdq m5, m3, m9
|
||||
|
||||
SUM_SUB 3, 5, 9
|
||||
punpckhqdq m5, m3
|
||||
pmulhrsw m5, m12
|
||||
|
||||
punpckhqdq m9, m1, m5
|
||||
punpcklqdq m1, m5
|
||||
SWAP 5, 9
|
||||
|
||||
SUM_SUB 0, 5, 9
|
||||
SUM_SUB 2, 1, 9
|
||||
|
||||
punpckhqdq m3, m0, m0
|
||||
punpckhqdq m4, m1, m1
|
||||
punpckhqdq m6, m5, m5
|
||||
punpckhqdq m7, m2, m2
|
||||
|
||||
punpcklwd m0, m3
|
||||
punpcklwd m7, m2
|
||||
punpcklwd m1, m4
|
||||
punpcklwd m6, m5
|
||||
|
||||
punpckhdq m4, m0, m7
|
||||
punpckldq m0, m7
|
||||
punpckhdq m10, m1, m6
|
||||
punpckldq m5, m1, m6
|
||||
|
||||
punpckhqdq m1, m0, m5
|
||||
punpcklqdq m0, m5
|
||||
punpckhqdq m3, m4, m10
|
||||
punpcklqdq m2, m4, m10
|
||||
|
||||
|
||||
pmulhrsw m0, m12
|
||||
pmulhrsw m6, m2, [dpw_30274_30274]
|
||||
pmulhrsw m4, m2, [dpw_12540_12540]
|
||||
|
||||
pmulhrsw m7, m1, [dpw_32138_32138]
|
||||
pmulhrsw m1, [dpw_6392_6392]
|
||||
pmulhrsw m5, m3, [dpw_m18204_m18204]
|
||||
pmulhrsw m3, [dpw_27246_27246]
|
||||
|
||||
mova m2, m0
|
||||
SUM_SUB 0, 6, 9
|
||||
SUM_SUB 2, 4, 9
|
||||
SUM_SUB 1, 5, 9
|
||||
SUM_SUB 7, 3, 9
|
||||
|
||||
SUM_SUB 3, 5, 9
|
||||
pmulhrsw m3, m12
|
||||
pmulhrsw m5, m12
|
||||
|
||||
SUM_SUB 0, 7, 9
|
||||
SUM_SUB 2, 3, 9
|
||||
SUM_SUB 4, 5, 9
|
||||
SUM_SUB 6, 1, 9
|
||||
|
||||
SWAP 3, 6
|
||||
SWAP 1, 2
|
||||
SWAP 2, 4
|
||||
|
||||
|
||||
pxor m12, m12
|
||||
ADD_STORE_8P_2X 0, 1, 9, 10, 12
|
||||
lea outputq, [outputq + r3]
|
||||
ADD_STORE_8P_2X 2, 3, 9, 10, 12
|
||||
lea outputq, [outputq + r3]
|
||||
ADD_STORE_8P_2X 4, 5, 9, 10, 12
|
||||
lea outputq, [outputq + r3]
|
||||
ADD_STORE_8P_2X 6, 7, 9, 10, 12
|
||||
|
||||
RET
|
||||
|
||||
%endif
|
||||
|
Loading…
Reference in New Issue
Block a user