SSSE3 8x8 inverse 2D-DCT with first 10 coeffs non-zero

This commit enables the SSSE3 assembly implementation of the 8x8
inverse 2D-DCT for blocks in which only the first 10 coefficients
are non-zero. The average runtime for this unit goes down from 198
cycles to 129 cycles (a 34.8% reduction).

Change-Id: Ie7fa4386f6d3a2fe0d47a2eb26fc2a6bbc592ac7
This commit is contained in:
Jingning Han 2014-05-02 16:29:08 -07:00
parent 928ff03889
commit 9e7b09bc5d
2 changed files with 139 additions and 1 deletions

View File

@ -313,7 +313,7 @@ add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/;
specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;

View File

@ -28,6 +28,29 @@ TRANSFORM_COEFFS 6270, 15137
TRANSFORM_COEFFS 3196, 16069
TRANSFORM_COEFFS 13623, 9102
; Paired word-constant builders.
; Each macro takes two integer arguments and emits one labelled 16-byte
; constant: four 16-bit words holding the first value followed by four
; 16-bit words holding the second value. The suffix selects the signs
; and the label name encodes them with an "m" prefix for a negated value:
;   PAIR_PP_COEFFS a, b -> dpw_a_b   : +a x4, +b x4
;   PAIR_MP_COEFFS a, b -> dpw_ma_b  : -a x4, +b x4
;   PAIR_MM_COEFFS a, b -> dpw_ma_mb : -a x4, -b x4
%macro PAIR_PP_COEFFS 2
dpw_%1_%2:
  times 4 dw %1
  times 4 dw %2
%endmacro
%macro PAIR_MP_COEFFS 2
dpw_m%1_%2:
  times 4 dw -%1
  times 4 dw %2
%endmacro
%macro PAIR_MM_COEFFS 2
dpw_m%1_m%2:
  times 4 dw -%1
  times 4 dw -%2
%endmacro
; Constant tables used as pmulhrsw multipliers by idct8x8_10_add.
; Every value is exactly twice one of the TRANSFORM_COEFFS entries listed
; above (12540 = 2*6270, 30274 = 2*15137, 6392 = 2*3196, 32138 = 2*16069,
; 27246 = 2*13623, 18204 = 2*9102).
; NOTE(review): the doubling presumably compensates for pmulhrsw's
; 15-bit rounding shift so the product matches a 14-bit fixed-point
; multiply - confirm against the C reference.
;
; Mixed pairs: a different multiplier in the low and high 64-bit halves.
PAIR_PP_COEFFS 30274, 12540
PAIR_PP_COEFFS 6392, 32138
PAIR_MP_COEFFS 18204, 27246
; Broadcast pairs: the same multiplier repeated in all eight word lanes.
PAIR_PP_COEFFS 12540, 12540
PAIR_PP_COEFFS 30274, 30274
PAIR_PP_COEFFS 6392, 6392
PAIR_PP_COEFFS 32138, 32138
PAIR_MM_COEFFS 18204, 18204
PAIR_PP_COEFFS 27246, 27246
SECTION .text
%if ARCH_X86_64
@ -128,6 +151,7 @@ SECTION .text
%endmacro
INIT_XMM ssse3
; full inverse 8x8 2D-DCT transform
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m11, [pw_16]
@ -159,4 +183,118 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
;-----------------------------------------------------------------------
; idct8x8_10_add
; Inverse 8x8 2D-DCT for blocks in which only the first 10 coefficients
; are non-zero; the result is written through `output`.
;
; Arguments as named on the cglobal line: input, output, stride.
; NOTE(review): assuming the usual x86inc cglobal convention, the
; numeric fields mean 3 arguments, 5 general-purpose registers and
; 13 xmm registers - confirm against the cglobal definition.
;
; Only the low four 16-bit words of the 16-byte loads at input offsets
; 0/16/32/48 are consumed (see the punpcklwd pair below); everything
; else in the input block is ignored by this routine.
;
; SUM_SUB, SWAP and ADD_STORE_8P_2X are defined outside this view.
; NOTE(review): SUM_SUB a, b, t presumably leaves a+b in m<a> and a-b
; in m<b> using m<t> as scratch; SWAP presumably exchanges the register
; numbers the m<N> names refer to from that point on - confirm.
;-----------------------------------------------------------------------
cglobal idct8x8_10_add, 3, 5, 13, input, output, stride
; Constants kept in registers.
; NOTE(review): m8 and m11 are never referenced directly in this
; function body; presumably they are consumed inside macros defined
; elsewhere (m11 by ADD_STORE_8P_2X?). If no macro used here reads m8,
; the pd_8192 load is dead - verify before removing.
mova m8, [pd_8192]
mova m11, [pw_16]
mova m12, [pw_11585x2]
; r3 = 2 * stride: the output pointer is advanced by r3 after each
; ADD_STORE_8P_2X at the end of the function.
lea r3, [2 * strideq]
; Load four consecutive 16-byte chunks of the input.
; NOTE(review): mova implies inputq is 16-byte aligned - confirm callers.
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
; Interleave the low four words of the loaded chunks (words, then
; dwords). Afterwards m0 holds word 0 of all four chunks in its low
; half and word 1 of all four chunks in its high half; m2 (after the
; SWAP) holds word 2 / word 3 the same way - a 4x4 word transpose.
punpcklwd m0, m1
punpcklwd m2, m3
punpckhdq m9, m0, m2
punpckldq m0, m2
SWAP 2, 9
; Duplicate each 64-bit half so every register carries one transposed
; vector twice (low half and high half):
; m0 -> [0], [0]
; m1 -> [1], [1]
; m2 -> [2], [2]
; m3 -> [3], [3]
punpckhqdq m10, m0, m0
punpcklqdq m0, m0
punpckhqdq m9, m2, m2
punpcklqdq m2, m2
SWAP 1, 10
SWAP 3, 9
; Multiply by the transform constants. m0 uses the same constant in
; both halves (m12 = pw_11585x2); m1..m3 use a dpw_* pair, i.e. a
; different multiplier for the low and the high 64-bit half, so one
; pmulhrsw yields two different products of the same input vector.
pmulhrsw m0, m12
pmulhrsw m2, [dpw_30274_12540]
pmulhrsw m1, [dpw_6392_32138]
pmulhrsw m3, [dpw_m18204_27246]
; Butterfly stage on the products.
SUM_SUB 0, 2, 9
SUM_SUB 1, 3, 9
; Re-pair the 64-bit halves of m3 so the next SUM_SUB combines its low
; and high halves with each other, then scale by pw_11585x2.
punpcklqdq m9, m3, m3
punpckhqdq m5, m3, m9
SUM_SUB 3, 5, 9
punpckhqdq m5, m3
pmulhrsw m5, m12
; Regroup the halves of m1 and m5 before the final butterflies of
; this pass.
punpckhqdq m9, m1, m5
punpcklqdq m1, m5
SWAP 5, 9
SUM_SUB 0, 5, 9
SUM_SUB 2, 1, 9
; Split the high 64-bit halves into separate registers, then run a
; word -> dword -> qword unpack ladder over the eight 4-word vectors.
; NOTE(review): this looks like the transpose between the two 1-D
; passes - confirm. Results land in m0..m3.
punpckhqdq m3, m0, m0
punpckhqdq m4, m1, m1
punpckhqdq m6, m5, m5
punpckhqdq m7, m2, m2
punpcklwd m0, m3
punpcklwd m7, m2
punpcklwd m1, m4
punpcklwd m6, m5
punpckhdq m4, m0, m7
punpckldq m0, m7
punpckhdq m10, m1, m6
punpckldq m5, m1, m6
punpckhqdq m1, m0, m5
punpcklqdq m0, m5
punpckhqdq m3, m4, m10
punpcklqdq m2, m4, m10
; Second round of multiplies. The dpw_X_X tables hold one constant in
; all eight lanes, so each of m1, m2, m3 is multiplied by two different
; constants into two destination registers (3-operand form keeps the
; source intact for the second multiply).
pmulhrsw m0, m12
pmulhrsw m6, m2, [dpw_30274_30274]
pmulhrsw m4, m2, [dpw_12540_12540]
pmulhrsw m7, m1, [dpw_32138_32138]
pmulhrsw m1, [dpw_6392_6392]
pmulhrsw m5, m3, [dpw_m18204_m18204]
pmulhrsw m3, [dpw_27246_27246]
; m2 = copy of m0, so m0 and m2 can each be combined with a different
; partner (m6 and m4) below.
mova m2, m0
SUM_SUB 0, 6, 9
SUM_SUB 2, 4, 9
SUM_SUB 1, 5, 9
SUM_SUB 7, 3, 9
; Sum/difference of m3 and m5, each then scaled by pw_11585x2.
SUM_SUB 3, 5, 9
pmulhrsw m3, m12
pmulhrsw m5, m12
; Final butterflies of the second pass.
SUM_SUB 0, 7, 9
SUM_SUB 2, 3, 9
SUM_SUB 4, 5, 9
SUM_SUB 6, 1, 9
; Rename registers so the results sit in m0..m7 in the order in which
; they are passed to ADD_STORE_8P_2X below.
SWAP 3, 6
SWAP 1, 2
SWAP 2, 4
; m12 no longer needs to hold pw_11585x2; zero it and pass it as the
; last ADD_STORE_8P_2X argument.
pxor m12, m12
; Four ADD_STORE_8P_2X invocations, each given two result registers
; (m9/m10 as the 3rd/4th arguments, zeroed m12 as the 5th), with the
; output pointer advanced by r3 = 2 * stride between them.
; NOTE(review): presumably each invocation adds two result rows to the
; existing destination pixels and stores them - confirm in the macro.
ADD_STORE_8P_2X 0, 1, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 2, 3, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 4, 5, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
%endif