vpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
Scott LaVarnway 0148e20c3c VPX: x86 asm version of vpx_idct32x32_1024_add()
Change-Id: I3ba4ede553e068bf116dce59d1317347988b3542
2015-11-25 10:11:29 -08:00

1235 lines
41 KiB
NASM

;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
; This file provides SSSE3 version of the inverse transformation. Part
; of the functions are originally derived from the ffmpeg project.
; Note that the current version applies to x86 64-bit only.
SECTION_RODATA
pw_11585x2: times 8 dw 23170
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_16364x2: times 8 dw 16364*2
pw_16305x2: times 8 dw 16305*2
pw_16207x2: times 8 dw 16207*2
pw_16069x2: times 8 dw 16069*2
pw_15893x2: times 8 dw 15893*2
pw_15679x2: times 8 dw 15679*2
pw_15426x2: times 8 dw 15426*2
pw__3981x2: times 8 dw 3981*2
pw__3196x2: times 8 dw 3196*2
pw__1606x2: times 8 dw 1606*2
pw___804x2: times 8 dw 804*2
pd_8192: times 4 dd 8192
pw_32: times 8 dw 32
pw_16: times 8 dw 16
%macro TRANSFORM_COEFFS 2
pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
%endmacro
TRANSFORM_COEFFS 6270, 15137
TRANSFORM_COEFFS 3196, 16069
TRANSFORM_COEFFS 13623, 9102
; constants for 32x32_34
TRANSFORM_COEFFS 804, 16364
TRANSFORM_COEFFS 15426, 5520
TRANSFORM_COEFFS 3981, 15893
TRANSFORM_COEFFS 16207, 2404
TRANSFORM_COEFFS 1606, 16305
TRANSFORM_COEFFS 15679, 4756
TRANSFORM_COEFFS 11585, 11585
; constants for 32x32_1024
TRANSFORM_COEFFS 12140, 11003
TRANSFORM_COEFFS 7005, 14811
TRANSFORM_COEFFS 14053, 8423
TRANSFORM_COEFFS 9760, 13160
TRANSFORM_COEFFS 12665, 10394
TRANSFORM_COEFFS 7723, 14449
%macro PAIR_PP_COEFFS 2
dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
%endmacro
%macro PAIR_MP_COEFFS 2
dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
%endmacro
%macro PAIR_MM_COEFFS 2
dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
%endmacro
PAIR_PP_COEFFS 30274, 12540
PAIR_PP_COEFFS 6392, 32138
PAIR_MP_COEFFS 18204, 27246
PAIR_PP_COEFFS 12540, 12540
PAIR_PP_COEFFS 30274, 30274
PAIR_PP_COEFFS 6392, 6392
PAIR_PP_COEFFS 32138, 32138
PAIR_MM_COEFFS 18204, 18204
PAIR_PP_COEFFS 27246, 27246
SECTION .text
%if ARCH_X86_64
%macro SUM_SUB 3
psubw m%3, m%1, m%2
paddw m%1, m%2
SWAP %2, %3
%endmacro
; butterfly operation
%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
pmaddwd m%1, m%3, %5
pmaddwd m%2, m%3, %6
paddd m%1, %4
paddd m%2, %4
psrad m%1, 14
psrad m%2, 14
%endmacro
%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
punpckhwd m%6, m%2, m%1
MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4]
punpcklwd m%2, m%1
MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4]
packssdw m%1, m%7
packssdw m%2, m%6
%endmacro
%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
punpckhwd m%6, m%2, m%1
MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
punpcklwd m%2, m%1
MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
packssdw m%1, m%7
packssdw m%2, m%6
%endmacro
; matrix transpose
%macro INTERLEAVE_2X 4
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
SWAP %3, %4
%endmacro
%macro TRANSPOSE8X8 9
INTERLEAVE_2X wd, %1, %2, %9
INTERLEAVE_2X wd, %3, %4, %9
INTERLEAVE_2X wd, %5, %6, %9
INTERLEAVE_2X wd, %7, %8, %9
INTERLEAVE_2X dq, %1, %3, %9
INTERLEAVE_2X dq, %2, %4, %9
INTERLEAVE_2X dq, %5, %7, %9
INTERLEAVE_2X dq, %6, %8, %9
INTERLEAVE_2X qdq, %1, %5, %9
INTERLEAVE_2X qdq, %3, %7, %9
INTERLEAVE_2X qdq, %2, %6, %9
INTERLEAVE_2X qdq, %4, %8, %9
SWAP %2, %5
SWAP %4, %7
%endmacro
%macro IDCT8_1D 0
SUM_SUB 0, 4, 9
BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10
pmulhrsw m0, m12
pmulhrsw m4, m12
BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10
BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10
SUM_SUB 1, 5, 9
SUM_SUB 7, 3, 9
SUM_SUB 0, 6, 9
SUM_SUB 4, 2, 9
SUM_SUB 3, 5, 9
pmulhrsw m3, m12
pmulhrsw m5, m12
SUM_SUB 0, 7, 9
SUM_SUB 4, 3, 9
SUM_SUB 2, 5, 9
SUM_SUB 6, 1, 9
SWAP 3, 6
SWAP 1, 4
%endmacro
; This macro handles 8 pixels per line
%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
paddw m%1, m11
paddw m%2, m11
psraw m%1, 5
psraw m%2, 5
movh m%3, [outputq]
movh m%4, [outputq + strideq]
punpcklbw m%3, m%5
punpcklbw m%4, m%5
paddw m%3, m%1
paddw m%4, m%2
packuswb m%3, m%5
packuswb m%4, m%5
movh [outputq], m%3
movh [outputq + strideq], m%4
%endmacro
INIT_XMM ssse3
; full inverse 8x8 2D-DCT transform
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m11, [pw_16]
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
mova m4, [inputq + 64]
mova m5, [inputq + 80]
mova m6, [inputq + 96]
mova m7, [inputq + 112]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT8_1D
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT8_1D
pxor m12, m12
ADD_STORE_8P_2X 0, 1, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 2, 3, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 4, 5, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m11, [pw_16]
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
punpcklwd m0, m1
punpcklwd m2, m3
punpckhdq m9, m0, m2
punpckldq m0, m2
SWAP 2, 9
; m0 -> [0], [0]
; m1 -> [1], [1]
; m2 -> [2], [2]
; m3 -> [3], [3]
punpckhqdq m10, m0, m0
punpcklqdq m0, m0
punpckhqdq m9, m2, m2
punpcklqdq m2, m2
SWAP 1, 10
SWAP 3, 9
pmulhrsw m0, m12
pmulhrsw m2, [dpw_30274_12540]
pmulhrsw m1, [dpw_6392_32138]
pmulhrsw m3, [dpw_m18204_27246]
SUM_SUB 0, 2, 9
SUM_SUB 1, 3, 9
punpcklqdq m9, m3, m3
punpckhqdq m5, m3, m9
SUM_SUB 3, 5, 9
punpckhqdq m5, m3
pmulhrsw m5, m12
punpckhqdq m9, m1, m5
punpcklqdq m1, m5
SWAP 5, 9
SUM_SUB 0, 5, 9
SUM_SUB 2, 1, 9
punpckhqdq m3, m0, m0
punpckhqdq m4, m1, m1
punpckhqdq m6, m5, m5
punpckhqdq m7, m2, m2
punpcklwd m0, m3
punpcklwd m7, m2
punpcklwd m1, m4
punpcklwd m6, m5
punpckhdq m4, m0, m7
punpckldq m0, m7
punpckhdq m10, m1, m6
punpckldq m5, m1, m6
punpckhqdq m1, m0, m5
punpcklqdq m0, m5
punpckhqdq m3, m4, m10
punpcklqdq m2, m4, m10
pmulhrsw m0, m12
pmulhrsw m6, m2, [dpw_30274_30274]
pmulhrsw m4, m2, [dpw_12540_12540]
pmulhrsw m7, m1, [dpw_32138_32138]
pmulhrsw m1, [dpw_6392_6392]
pmulhrsw m5, m3, [dpw_m18204_m18204]
pmulhrsw m3, [dpw_27246_27246]
mova m2, m0
SUM_SUB 0, 6, 9
SUM_SUB 2, 4, 9
SUM_SUB 1, 5, 9
SUM_SUB 7, 3, 9
SUM_SUB 3, 5, 9
pmulhrsw m3, m12
pmulhrsw m5, m12
SUM_SUB 0, 7, 9
SUM_SUB 2, 3, 9
SUM_SUB 4, 5, 9
SUM_SUB 6, 1, 9
SWAP 3, 6
SWAP 1, 2
SWAP 2, 4
pxor m12, m12
ADD_STORE_8P_2X 0, 1, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 2, 3, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 4, 5, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
%define idx0 16 * 0
%define idx1 16 * 1
%define idx2 16 * 2
%define idx3 16 * 3
%define idx4 16 * 4
%define idx5 16 * 5
%define idx6 16 * 6
%define idx7 16 * 7
%define idx8 16 * 0
%define idx9 16 * 1
%define idx10 16 * 2
%define idx11 16 * 3
%define idx12 16 * 4
%define idx13 16 * 5
%define idx14 16 * 6
%define idx15 16 * 7
%define idx16 16 * 0
%define idx17 16 * 1
%define idx18 16 * 2
%define idx19 16 * 3
%define idx20 16 * 4
%define idx21 16 * 5
%define idx22 16 * 6
%define idx23 16 * 7
%define idx24 16 * 0
%define idx25 16 * 1
%define idx26 16 * 2
%define idx27 16 * 3
%define idx28 16 * 4
%define idx29 16 * 5
%define idx30 16 * 6
%define idx31 16 * 7
; FROM idct32x32_add_neon.asm
;
; Instead of doing the transforms stage by stage, it is done by loading
; some input values and doing as many stages as possible to minimize the
; storing/loading of intermediate results. To fit within registers, the
; final coefficients are cut into four blocks:
; BLOCK A: 16-19,28-31
; BLOCK B: 20-23,24-27
; BLOCK C: 8-11,12-15
; BLOCK D: 0-3,4-7
; Blocks A and C are straight calculation through the various stages. In
; block B, further calculations are performed using the results from
; block A. In block D, further calculations are performed using the results
; from block C and then the final calculations are done using results from
; block A and B which have been combined at the end of block B.
;
%macro IDCT32X32_34 4
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, m1
pmulhrsw m1, [pw___804x2] ; stp1_16
mova [r4 + 0], m0
pmulhrsw m11, [pw_16364x2] ; stp2_31
mova [r4 + 16 * 2], m2
mova m12, m7
pmulhrsw m7, [pw_15426x2] ; stp1_28
mova [r4 + 16 * 4], m4
pmulhrsw m12, [pw_m5520x2] ; stp2_19
mova [r4 + 16 * 6], m6
; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m2, m1 ; stp1_16
mova m0, m11 ; stp1_31
mova m4, m7 ; stp1_28
mova m15, m12 ; stp1_19
; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m6, m5
pmulhrsw m5, [pw__3981x2] ; stp1_20
mova [stp + %4 + idx28], m12
mova [stp + %4 + idx29], m15
pmulhrsw m6, [pw_15893x2] ; stp2_27
mova [stp + %4 + idx30], m2
mova m2, m3
pmulhrsw m3, [pw_m2404x2] ; stp1_23
mova [stp + %4 + idx31], m11
pmulhrsw m2, [pw_16207x2] ; stp2_24
; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m13, m5 ; stp1_20
mova m14, m6 ; stp1_27
mova m15, m3 ; stp1_23
mova m11, m2 ; stp1_24
; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
mova [stp + %3 + idx16], m1
mova [stp + %3 + idx17], m0
mova [stp + %3 + idx18], m4
mova [stp + %3 + idx19], m7
mova m4, [stp + %4 + idx28]
mova m7, [stp + %4 + idx29]
mova m10, [stp + %4 + idx30]
mova m12, [stp + %4 + idx31]
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
mova [stp + %4 + idx28], m4
mova [stp + %4 + idx29], m7
mova [stp + %4 + idx30], m10
mova [stp + %4 + idx31], m12
; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 6, 5, 9
pmulhrsw m6, m10 ; stp1_27
pmulhrsw m5, m10 ; stp1_20
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_26
pmulhrsw m14, m10 ; stp1_21
SUM_SUB 11, 15, 9
pmulhrsw m11, m10 ; stp1_25
pmulhrsw m15, m10 ; stp1_22
SUM_SUB 2, 3, 9
pmulhrsw m2, m10 ; stp1_24
pmulhrsw m3, m10 ; stp1_23
%else
BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
SWAP 6, 5
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
SWAP 13, 14
BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
SWAP 11, 15
BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
SWAP 2, 3
%endif
mova [stp + %4 + idx24], m2
mova [stp + %4 + idx25], m11
mova [stp + %4 + idx26], m13
mova [stp + %4 + idx27], m6
; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 2]
mova m6, [rsp + transposed_in + 16 * 6]
mova m1, m0
pmulhrsw m0, [pw__1606x2] ; stp1_8
mova [stp + %3 + idx20], m5
mova [stp + %3 + idx21], m14
pmulhrsw m1, [pw_16305x2] ; stp2_15
mova [stp + %3 + idx22], m15
mova m7, m6
pmulhrsw m7, [pw_m4756x2] ; stp2_11
mova [stp + %3 + idx23], m3
pmulhrsw m6, [pw_15679x2] ; stp1_12
; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m3, m0 ; stp1_8
mova m2, m1 ; stp1_15
; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
mova m4, m7 ; stp1_11
mova m5, m6 ; stp1_12
BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 5, 4, 9
pmulhrsw m5, m10 ; stp1_13
pmulhrsw m4, m10 ; stp1_10
SUM_SUB 6, 7, 9
pmulhrsw m6, m10 ; stp1_12
pmulhrsw m7, m10 ; stp1_11
%else
BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
SWAP 5, 4
BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
SWAP 6, 7
%endif
; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova [stp + %2 + idx8], m0
mova [stp + %2 + idx9], m2
mova [stp + %2 + idx10], m4
mova [stp + %2 + idx11], m7
; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, [rsp + transposed_in + 16 * 4]
mova m12, m11
pmulhrsw m11, [pw__3196x2] ; stp1_4
pmulhrsw m12, [pw_16069x2] ; stp1_7
; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 0]
mova m10, [pw_11585x2]
pmulhrsw m0, m10 ; stp1_1
mova m14, m11 ; stp1_4
mova m13, m12 ; stp1_7
; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_6
pmulhrsw m14, m10 ; stp1_5
%else
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
SWAP 13, 14
%endif
mova m7, m0 ; stp1_0 = stp1_1
mova m4, m0 ; stp1_1
mova m2, m7 ; stp1_0
; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
; 0-3, 28-31 final stage
mova m15, [stp + %4 + idx30]
mova m10, [stp + %4 + idx31]
SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
mova [stp + %1 + idx0], m0
mova [stp + %1 + idx1], m7
mova [stp + %4 + idx30], m15
mova [stp + %4 + idx31], m10
mova m7, [stp + %4 + idx28]
mova m0, [stp + %4 + idx29]
SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
mova [stp + %1 + idx2], m2
mova [stp + %1 + idx3], m4
mova [stp + %4 + idx28], m7
mova [stp + %4 + idx29], m0
; 12-15, 16-19 final stage
mova m0, [stp + %3 + idx16]
mova m7, [stp + %3 + idx17]
mova m2, [stp + %3 + idx18]
mova m4, [stp + %3 + idx19]
SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
mova [stp + %2 + idx12], m6
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m3
mova [stp + %2 + idx15], m1
mova [stp + %3 + idx16], m0
mova [stp + %3 + idx17], m7
mova [stp + %3 + idx18], m2
mova [stp + %3 + idx19], m4
mova m4, [stp + %2 + idx8]
mova m5, [stp + %2 + idx9]
mova m6, [stp + %2 + idx10]
mova m7, [stp + %2 + idx11]
SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
; 4-7, 24-27 final stage
mova m0, [stp + %4 + idx27]
mova m1, [stp + %4 + idx26]
mova m2, [stp + %4 + idx25]
mova m3, [stp + %4 + idx24]
SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
mova [stp + %4 + idx27], m0
mova [stp + %4 + idx26], m1
mova [stp + %4 + idx25], m2
mova [stp + %4 + idx24], m3
mova [stp + %1 + idx4], m11
mova [stp + %1 + idx5], m14
mova [stp + %1 + idx6], m13
mova [stp + %1 + idx7], m12
; 8-11, 20-23 final stage
mova m0, [stp + %3 + idx20]
mova m1, [stp + %3 + idx21]
mova m2, [stp + %3 + idx22]
mova m3, [stp + %3 + idx23]
SUM_SUB 7, 0, 9 ; stp1_11, stp_20
SUM_SUB 6, 1, 9 ; stp1_10, stp_21
SUM_SUB 5, 2, 9 ; stp1_9, stp_22
SUM_SUB 4, 3, 9 ; stp1_8, stp_23
mova [stp + %2 + idx8], m4
mova [stp + %2 + idx9], m5
mova [stp + %2 + idx10], m6
mova [stp + %2 + idx11], m7
mova [stp + %3 + idx20], m0
mova [stp + %3 + idx21], m1
mova [stp + %3 + idx22], m2
mova [stp + %3 + idx23], m3
%endmacro
%macro RECON_AND_STORE 1
mova m11, [pw_32]
lea stp, [rsp + %1]
mov r6, 32
pxor m8, m8
%%recon_and_store:
mova m0, [stp + 16 * 32 * 0]
mova m1, [stp + 16 * 32 * 1]
mova m2, [stp + 16 * 32 * 2]
mova m3, [stp + 16 * 32 * 3]
add stp, 16
paddw m0, m11
paddw m1, m11
paddw m2, m11
paddw m3, m11
psraw m0, 6
psraw m1, 6
psraw m2, 6
psraw m3, 6
movh m4, [outputq + 0]
movh m5, [outputq + 8]
movh m6, [outputq + 16]
movh m7, [outputq + 24]
punpcklbw m4, m8
punpcklbw m5, m8
punpcklbw m6, m8
punpcklbw m7, m8
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
packuswb m0, m1
packuswb m2, m3
mova [outputq + 0], m0
mova [outputq + 16], m2
lea outputq, [outputq + strideq]
dec r6
jnz %%recon_and_store
%endmacro
%define i32x32_size 16*32*5
%define pass_two_start 16*32*0
%define transposed_in 16*32*4
%define pass_one_start 16*32*0
%define stp r8
INIT_XMM ssse3
cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
mova m8, [pd_8192]
lea stp, [rsp + pass_one_start]
idct32x32_34:
mov r3, inputq
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT32X32_34 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
mov r6, 4
lea stp, [rsp + pass_one_start]
lea r9, [rsp + pass_one_start]
idct32x32_34_2:
lea r4, [rsp + transposed_in]
mov r3, r9
idct32x32_34_transpose_2:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 1]
mova m2, [r3 + 16 * 2]
mova m3, [r3 + 16 * 3]
mova m4, [r3 + 16 * 4]
mova m5, [r3 + 16 * 5]
mova m6, [r3 + 16 * 6]
mova m7, [r3 + 16 * 7]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT32X32_34 16*0, 16*8, 16*16, 16*24
lea stp, [stp + 16 * 32]
add r9, 16 * 32
dec r6
jnz idct32x32_34_2
RECON_AND_STORE pass_two_start
RET
%macro IDCT32X32_1024 4
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m1, [rsp + transposed_in + 16 * 1]
mova m11, [rsp + transposed_in + 16 * 31]
BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31
mova m0, [rsp + transposed_in + 16 * 15]
mova m2, [rsp + transposed_in + 16 * 17]
BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30
mova m7, [rsp + transposed_in + 16 * 7]
mova m12, [rsp + transposed_in + 16 * 25]
BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28
mova m3, [rsp + transposed_in + 16 * 9]
mova m4, [rsp + transposed_in + 16 * 23]
BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29
; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
mova [stp + %3 + idx16], m1
mova [stp + %3 + idx17], m0
mova [stp + %3 + idx18], m4
mova [stp + %3 + idx19], m7
mova [stp + %4 + idx28], m12
mova [stp + %4 + idx29], m3
mova [stp + %4 + idx30], m2
mova [stp + %4 + idx31], m11
; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m5, [rsp + transposed_in + 16 * 5]
mova m6, [rsp + transposed_in + 16 * 27]
BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27
mova m13, [rsp + transposed_in + 16 * 21]
mova m14, [rsp + transposed_in + 16 * 11]
BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26
mova m0, [rsp + transposed_in + 16 * 13]
mova m1, [rsp + transposed_in + 16 * 19]
BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25
mova m2, [rsp + transposed_in + 16 * 3]
mova m3, [rsp + transposed_in + 16 * 29]
BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24
; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m4, [stp + %3 + idx16]
mova m7, [stp + %3 + idx17]
mova m11, [stp + %3 + idx18]
mova m12, [stp + %3 + idx19]
SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
mova [stp + %3 + idx16], m4
mova [stp + %3 + idx17], m7
mova [stp + %3 + idx18], m11
mova [stp + %3 + idx19], m12
mova m4, [stp + %4 + idx28]
mova m7, [stp + %4 + idx29]
mova m11, [stp + %4 + idx30]
mova m12, [stp + %4 + idx31]
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
mova [stp + %4 + idx28], m4
mova [stp + %4 + idx29], m7
mova [stp + %4 + idx30], m11
mova [stp + %4 + idx31], m12
; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 6, 5, 9
pmulhrsw m6, m10 ; stp1_27
pmulhrsw m5, m10 ; stp1_20
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_26
pmulhrsw m14, m10 ; stp1_21
SUM_SUB 1, 0, 9
pmulhrsw m1, m10 ; stp1_25
pmulhrsw m0, m10 ; stp1_22
SUM_SUB 2, 3, 9
pmulhrsw m2, m10 ; stp1_25
pmulhrsw m3, m10 ; stp1_22
%else
BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
SWAP 6, 5
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
SWAP 13, 14
BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
SWAP 1, 0
BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
SWAP 2, 3
%endif
mova [stp + %3 + idx20], m5
mova [stp + %3 + idx21], m14
mova [stp + %3 + idx22], m0
mova [stp + %3 + idx23], m3
mova [stp + %4 + idx24], m2
mova [stp + %4 + idx25], m1
mova [stp + %4 + idx26], m13
mova [stp + %4 + idx27], m6
; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 2]
mova m1, [rsp + transposed_in + 16 * 30]
BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15
mova m2, [rsp + transposed_in + 16 * 14]
mova m3, [rsp + transposed_in + 16 * 18]
BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14
mova m4, [rsp + transposed_in + 16 * 10]
mova m5, [rsp + transposed_in + 16 * 22]
BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13
mova m6, [rsp + transposed_in + 16 * 6]
mova m7, [rsp + transposed_in + 16 * 26]
BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12
; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 5, 4, 9
pmulhrsw m5, m10 ; stp1_13
pmulhrsw m4, m10 ; stp1_10
SUM_SUB 6, 7, 9
pmulhrsw m6, m10 ; stp1_12
pmulhrsw m7, m10 ; stp1_11
%else
BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
SWAP 5, 4
BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
SWAP 6, 7
%endif
; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova [stp + %2 + idx8], m0
mova [stp + %2 + idx9], m2
mova [stp + %2 + idx10], m4
mova [stp + %2 + idx11], m7
mova [stp + %2 + idx12], m6
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m3
mova [stp + %2 + idx15], m1
; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, [rsp + transposed_in + 16 * 4]
mova m12, [rsp + transposed_in + 16 * 28]
BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7
mova m13, [rsp + transposed_in + 16 * 12]
mova m14, [rsp + transposed_in + 16 * 20]
BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6
; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 0]
mova m1, [rsp + transposed_in + 16 * 16]
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 0, 1, 9
pmulhrsw m0, m10 ; stp1_1
pmulhrsw m1, m10 ; stp1_0
%else
BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
SWAP 0, 1
%endif
mova m2, [rsp + transposed_in + 16 * 8]
mova m3, [rsp + transposed_in + 16 * 24]
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3
mova m10, [pw_11585x2]
SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_6
pmulhrsw m14, m10 ; stp1_5
%else
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
SWAP 13, 14
%endif
SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m4, [stp + %2 + idx12]
mova m5, [stp + %2 + idx13]
mova m6, [stp + %2 + idx14]
mova m7, [stp + %2 + idx15]
SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
; 0-3, 28-31 final stage
mova m10, [stp + %4 + idx31]
mova m15, [stp + %4 + idx30]
SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
mova [stp + %1 + idx0], m0
mova [stp + %1 + idx1], m1
mova [stp + %4 + idx31], m10
mova [stp + %4 + idx30], m15
mova m0, [stp + %4 + idx29]
mova m1, [stp + %4 + idx28]
SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
mova [stp + %1 + idx2], m2
mova [stp + %1 + idx3], m3
mova [stp + %4 + idx29], m0
mova [stp + %4 + idx28], m1
; 12-15, 16-19 final stage
mova m0, [stp + %3 + idx16]
mova m1, [stp + %3 + idx17]
mova m2, [stp + %3 + idx18]
mova m3, [stp + %3 + idx19]
SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
mova [stp + %2 + idx12], m4
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m6
mova [stp + %2 + idx15], m7
mova [stp + %3 + idx16], m0
mova [stp + %3 + idx17], m1
mova [stp + %3 + idx18], m2
mova [stp + %3 + idx19], m3
mova m4, [stp + %2 + idx8]
mova m5, [stp + %2 + idx9]
mova m6, [stp + %2 + idx10]
mova m7, [stp + %2 + idx11]
SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
; 4-7, 24-27 final stage
mova m3, [stp + %4 + idx24]
mova m2, [stp + %4 + idx25]
mova m1, [stp + %4 + idx26]
mova m0, [stp + %4 + idx27]
SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
mova [stp + %4 + idx24], m3
mova [stp + %4 + idx25], m2
mova [stp + %4 + idx26], m1
mova [stp + %4 + idx27], m0
mova [stp + %1 + idx4], m11
mova [stp + %1 + idx5], m14
mova [stp + %1 + idx6], m13
mova [stp + %1 + idx7], m12
; 8-11, 20-23 final stage
mova m0, [stp + %3 + idx20]
mova m1, [stp + %3 + idx21]
mova m2, [stp + %3 + idx22]
mova m3, [stp + %3 + idx23]
SUM_SUB 7, 0, 9 ; stp1_11, stp_20
SUM_SUB 6, 1, 9 ; stp1_10, stp_21
SUM_SUB 5, 2, 9 ; stp1_9, stp_22
SUM_SUB 4, 3, 9 ; stp1_8, stp_23
mova [stp + %2 + idx8], m4
mova [stp + %2 + idx9], m5
mova [stp + %2 + idx10], m6
mova [stp + %2 + idx11], m7
mova [stp + %3 + idx20], m0
mova [stp + %3 + idx21], m1
mova [stp + %3 + idx22], m2
mova [stp + %3 + idx23], m3
%endmacro
INIT_XMM ssse3
cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
mova m8, [pd_8192]
mov r6, 4
lea stp, [rsp + pass_one_start]
idct32x32_1024:
mov r3, inputq
lea r4, [rsp + transposed_in]
mov r7, 4
idct32x32_1024_transpose:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
mova [r4 + 16 * 1], m1
mova [r4 + 16 * 2], m2
mova [r4 + 16 * 3], m3
mova [r4 + 16 * 4], m4
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
add r3, 16
add r4, 16 * 8
dec r7
jne idct32x32_1024_transpose
IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
lea inputq, [inputq + 16 * 32]
dec r6
jnz idct32x32_1024
mov r6, 4
lea stp, [rsp + pass_one_start]
lea r9, [rsp + pass_one_start]
idct32x32_1024_2:
lea r4, [rsp + transposed_in]
mov r3, r9
mov r7, 4
idct32x32_1024_transpose_2:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 1]
mova m2, [r3 + 16 * 2]
mova m3, [r3 + 16 * 3]
mova m4, [r3 + 16 * 4]
mova m5, [r3 + 16 * 5]
mova m6, [r3 + 16 * 6]
mova m7, [r3 + 16 * 7]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
mova [r4 + 16 * 1], m1
mova [r4 + 16 * 2], m2
mova [r4 + 16 * 3], m3
mova [r4 + 16 * 4], m4
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
add r3, 16 * 8
add r4, 16 * 8
dec r7
jne idct32x32_1024_transpose_2
IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
lea stp, [stp + 16 * 32]
add r9, 16 * 32
dec r6
jnz idct32x32_1024_2
RECON_AND_STORE pass_two_start
RET
%endif