VPX: x86 asm version of vpx_idct32x32_34_add()

Change-Id: Ic81f38998fb1b8d33f5a5d7424c2c41002786cef
This commit is contained in:
Scott LaVarnway 2015-11-17 17:42:24 -08:00
parent 1b63238b67
commit ed833048c2
2 changed files with 487 additions and 1 deletions

View File

@ -893,7 +893,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/;
specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
# Need to add 34 eob idct32x32 neon implementation.
$vpx_idct32x32_34_add_neon_asm=vpx_idct32x32_1024_add_neon;

View File

@ -17,18 +17,46 @@
SECTION_RODATA
pw_11585x2: times 8 dw 23170
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_16364x2: times 8 dw 16364*2
pw_16305x2: times 8 dw 16305*2
pw_16207x2: times 8 dw 16207*2
pw_16069x2: times 8 dw 16069*2
pw_15893x2: times 8 dw 15893*2
pw_15679x2: times 8 dw 15679*2
pw_15426x2: times 8 dw 15426*2
pw__3981x2: times 8 dw 3981*2
pw__3196x2: times 8 dw 3196*2
pw__1606x2: times 8 dw 1606*2
pw___804x2: times 8 dw 804*2
pd_8192: times 4 dd 8192
pw_32: times 8 dw 32
pw_16: times 8 dw 16
%macro TRANSFORM_COEFFS 2
pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
%endmacro
TRANSFORM_COEFFS 6270, 15137
TRANSFORM_COEFFS 3196, 16069
TRANSFORM_COEFFS 13623, 9102
; constants for 32x32_34
TRANSFORM_COEFFS 804, 16364
TRANSFORM_COEFFS 15426, 5520
TRANSFORM_COEFFS 3981, 15893
TRANSFORM_COEFFS 16207, 2404
TRANSFORM_COEFFS 1606, 16305
TRANSFORM_COEFFS 15679, 4756
TRANSFORM_COEFFS 11585, 11585
%macro PAIR_PP_COEFFS 2
dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
%endmacro
@ -80,6 +108,15 @@ SECTION .text
packssdw m%2, m%6
%endmacro
%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
punpckhwd m%6, m%2, m%1
MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
punpcklwd m%2, m%1
MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
packssdw m%1, m%7
packssdw m%2, m%6
%endmacro
; matrix transpose
%macro INTERLEAVE_2X 4
punpckh%1 m%4, m%2, m%3
@ -298,4 +335,453 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
RET
%define idx0 16 * 0
%define idx1 16 * 1
%define idx2 16 * 2
%define idx3 16 * 3
%define idx4 16 * 4
%define idx5 16 * 5
%define idx6 16 * 6
%define idx7 16 * 7
%define idx8 16 * 0
%define idx9 16 * 1
%define idx10 16 * 2
%define idx11 16 * 3
%define idx12 16 * 4
%define idx13 16 * 5
%define idx14 16 * 6
%define idx15 16 * 7
%define idx16 16 * 0
%define idx17 16 * 1
%define idx18 16 * 2
%define idx19 16 * 3
%define idx20 16 * 4
%define idx21 16 * 5
%define idx22 16 * 6
%define idx23 16 * 7
%define idx24 16 * 0
%define idx25 16 * 1
%define idx26 16 * 2
%define idx27 16 * 3
%define idx28 16 * 4
%define idx29 16 * 5
%define idx30 16 * 6
%define idx31 16 * 7
%macro IDCT32X32_34x 4
; FROM idct32x32_add_neon.asm
;
; Instead of doing the transforms stage by stage, it is done by loading
; some input values and doing as many stages as possible to minimize the
; storing/loading of intermediate results. To fit within registers, the
; final coefficients are cut into four blocks:
; BLOCK A: 16-19,28-31
; BLOCK B: 20-23,24-27
; BLOCK C: 8-11,12-15
; BLOCK D: 0-3,4-7
; Blocks A and C are straight calculation through the various stages. In
; block B, further calculations are performed using the results from
; block A. In block D, further calculations are performed using the results
; from block C and then the final calculations are done using results from
; block A and B which have been combined at the end of block B.
;
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, m1
pmulhrsw m1, [pw___804x2] ; stp1_16
mova [r4 + 0], m0
pmulhrsw m11, [pw_16364x2] ; stp2_31
mova [r4 + 16 * 2], m2
mova m12, m7
pmulhrsw m7, [pw_15426x2] ; stp1_28
mova [r4 + 16 * 4], m4
pmulhrsw m12, [pw_m5520x2] ; stp2_19
mova [r4 + 16 * 6], m6
; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m2, m1 ; stp1_16
mova m0, m11 ; stp1_31
mova m4, m7 ; stp1_28
mova m15, m12 ; stp1_19
; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m6, m5
pmulhrsw m5, [pw__3981x2] ; stp1_20
mova [stp + %4 + idx28], m12
mova [stp + %4 + idx29], m15
pmulhrsw m6, [pw_15893x2] ; stp2_27
mova [stp + %4 + idx30], m2
mova m2, m3
pmulhrsw m3, [pw_m2404x2] ; stp1_23
mova [stp + %4 + idx31], m11
pmulhrsw m2, [pw_16207x2] ; stp2_24
; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m13, m5 ; stp1_20
mova m14, m6 ; stp1_27
mova m15, m3 ; stp1_23
mova m11, m2 ; stp1_24
; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
mova [stp + %3 + idx16], m1
mova [stp + %3 + idx17], m0
mova [stp + %3 + idx18], m4
mova [stp + %3 + idx19], m7
mova m4, [stp + %4 + idx28]
mova m7, [stp + %4 + idx29]
mova m10, [stp + %4 + idx30]
mova m12, [stp + %4 + idx31]
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
mova [stp + %4 + idx28], m4
mova [stp + %4 + idx29], m7
mova [stp + %4 + idx30], m10
mova [stp + %4 + idx31], m12
; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 6, 5, 9
pmulhrsw m6, m10 ; stp1_27
pmulhrsw m5, m10 ; stp1_20
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_26
pmulhrsw m14, m10 ; stp1_21
SUM_SUB 11, 15, 9
pmulhrsw m11, m10 ; stp1_25
pmulhrsw m15, m10 ; stp1_22
SUM_SUB 2, 3, 9
pmulhrsw m2, m10 ; stp1_24
pmulhrsw m3, m10 ; stp1_23
%else
BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
SWAP 6, 5
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
SWAP 13, 14
BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
SWAP 11, 15
BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
SWAP 2, 3
%endif
mova [stp + %4 + idx24], m2
mova [stp + %4 + idx25], m11
mova [stp + %4 + idx26], m13
mova [stp + %4 + idx27], m6
; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 2]
mova m6, [rsp + transposed_in + 16 * 6]
mova m1, m0
pmulhrsw m0, [pw__1606x2] ; stp1_8
mova [stp + %3 + idx20], m5
mova [stp + %3 + idx21], m14
pmulhrsw m1, [pw_16305x2] ; stp2_15
mova [stp + %3 + idx22], m15
mova m7, m6
pmulhrsw m7, [pw_m4756x2] ; stp2_11
mova [stp + %3 + idx23], m3
pmulhrsw m6, [pw_15679x2] ; stp1_12
; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m3, m0 ; stp1_8
mova m2, m1 ; stp1_15
; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
mova m4, m7 ; stp1_11
mova m5, m6 ; stp1_12
BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 5, 4, 9
pmulhrsw m5, m10 ; stp1_13
pmulhrsw m4, m10 ; stp1_10
SUM_SUB 6, 7, 9
pmulhrsw m6, m10 ; stp1_12
pmulhrsw m7, m10 ; stp1_11
%else
BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
SWAP 5, 4
BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
SWAP 6, 7
%endif
; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova [stp + %2 + idx8], m0
mova [stp + %2 + idx9], m2
mova [stp + %2 + idx10], m4
mova [stp + %2 + idx11], m7
; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, [rsp + transposed_in + 16 * 4]
mova m12, m11
pmulhrsw m11, [pw__3196x2] ; stp1_4
pmulhrsw m12, [pw_16069x2] ; stp1_7
; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 0]
mova m10, [pw_11585x2]
mova m7, m0
pmulhrsw m0, m10 ; stp1_1
pmulhrsw m7, m10 ; stp1_0
mova m14, m11 ; stp1_4
mova m13, m12 ; stp1_7
; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_6
pmulhrsw m14, m10 ; stp1_5
%else
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
SWAP 13, 14
%endif
mova m4, m0 ; stp1_1
mova m2, m7 ; stp1_0
; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
; 0-3, 28-31 final stage
mova m15, [stp + %4 + idx30]
mova m10, [stp + %4 + idx31]
SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
mova [stp + %1 + idx0], m0
mova [stp + %1 + idx1], m7
mova [stp + %4 + idx30], m15
mova [stp + %4 + idx31], m10
mova m7, [stp + %4 + idx28]
mova m0, [stp + %4 + idx29]
SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
mova [stp + %1 + idx2], m2
mova [stp + %1 + idx3], m4
mova [stp + %4 + idx28], m7
mova [stp + %4 + idx29], m0
; 12-15, 16-19 final stage
mova m0, [stp + %3 + idx16]
mova m7, [stp + %3 + idx17]
mova m2, [stp + %3 + idx18]
mova m4, [stp + %3 + idx19]
SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
mova [stp + %2 + idx12], m6
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m3
mova [stp + %2 + idx15], m1
mova [stp + %3 + idx16], m0
mova [stp + %3 + idx17], m7
mova [stp + %3 + idx18], m2
mova [stp + %3 + idx19], m4
mova m4, [stp + %2 + idx8]
mova m5, [stp + %2 + idx9]
mova m6, [stp + %2 + idx10]
mova m7, [stp + %2 + idx11]
SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
; 4-7, 24-27 final stage
mova m0, [stp + %4 + idx27]
mova m1, [stp + %4 + idx26]
mova m2, [stp + %4 + idx25]
mova m3, [stp + %4 + idx24]
SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
mova [stp + %4 + idx27], m0
mova [stp + %4 + idx26], m1
mova [stp + %4 + idx25], m2
mova [stp + %4 + idx24], m3
mova [stp + %1 + idx4], m11
mova [stp + %1 + idx5], m14
mova [stp + %1 + idx6], m13
mova [stp + %1 + idx7], m12
; 8-11, 20-23 final stage
mova m0, [stp + %3 + idx20]
mova m1, [stp + %3 + idx21]
mova m2, [stp + %3 + idx22]
mova m3, [stp + %3 + idx23]
SUM_SUB 7, 0, 9 ; stp1_11, stp_20
SUM_SUB 6, 1, 9 ; stp1_10, stp_21
SUM_SUB 5, 2, 9 ; stp1_9, stp_22
SUM_SUB 4, 3, 9 ; stp1_8, stp_23
mova [stp + %2 + idx8], m4
mova [stp + %2 + idx9], m5
mova [stp + %2 + idx10], m6
mova [stp + %2 + idx11], m7
mova [stp + %3 + idx20], m0
mova [stp + %3 + idx21], m1
mova [stp + %3 + idx22], m2
mova [stp + %3 + idx23], m3
%endmacro
%macro RECON_AND_STORE 1
mova m11, [pw_32]
lea stp, [rsp + %1]
mov r6, 32
pxor m8, m8
%%recon_and_store:
mova m0, [stp + 16 * 32 * 0]
mova m1, [stp + 16 * 32 * 1]
mova m2, [stp + 16 * 32 * 2]
mova m3, [stp + 16 * 32 * 3]
add stp, 16
paddw m0, m11
paddw m1, m11
paddw m2, m11
paddw m3, m11
psraw m0, 6
psraw m1, 6
psraw m2, 6
psraw m3, 6
movh m4, [outputq + 0]
movh m5, [outputq + 8]
movh m6, [outputq + 16]
movh m7, [outputq + 24]
punpcklbw m4, m8
punpcklbw m5, m8
punpcklbw m6, m8
punpcklbw m7, m8
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
packuswb m0, m1
packuswb m2, m3
mova [outputq + 0], m0
mova [outputq + 16], m2
lea outputq, [outputq + strideq]
dec r6
jnz %%recon_and_store
%endmacro
%define i32x32_size 16*32*5
%define pass_two_start 16*32*0
%define transposed_in 16*32*4
%define pass_one_start 16*32*0
%define stp r8
INIT_XMM ssse3
cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
mova m8, [pd_8192]
lea stp, [rsp + pass_one_start]
idct32x32_34:
mov r3, inputq
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT32X32_34x 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
mov r6, 4
lea stp, [rsp + pass_one_start]
lea r9, [rsp + pass_one_start]
idct32x32_34_2:
lea r4, [rsp + transposed_in]
mov r3, r9
idct32x32_34_transpose_2:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 1]
mova m2, [r3 + 16 * 2]
mova m3, [r3 + 16 * 3]
mova m4, [r3 + 16 * 4]
mova m5, [r3 + 16 * 5]
mova m6, [r3 + 16 * 6]
mova m7, [r3 + 16 * 7]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT32X32_34x 16*0, 16*8, 16*16, 16*24
lea stp, [stp + 16 * 32]
add r9, 16 * 32
dec r6
jnz idct32x32_34_2
RECON_AND_STORE pass_two_start
RET
%endif