ffmpeg/libavcodec/x86/vp9intrapred_16bpp.asm
Ronald S. Bultje ce78729033 vp9: don't keep a stack pointer if we don't need it.
This saves one register in a few cases on 32bit builds with unaligned
stack (e.g. MSVC), making the code slightly easier to maintain.

(Can someone please test this on 32bit+msvc and confirm make fate-vp9
and tests/checkasm/checkasm still work after this patch?)
2015-10-07 08:55:19 -04:00

2136 lines
74 KiB
NASM

;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
; Dword rounding terms (8 dwords each) added before the final right shift in
; the DC predictors.
pd_2: times 8 dd 2
pd_4: times 8 dd 4
pd_8: times 8 dd 8
; pshufb byte indices: drop the first word and repeat the last one
; (abcdefgh -> bcdefghh); default shuffle operand of SHIFT_RIGHT/SHIFT_RIGHTx2.
pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
; pshufb byte indices used by the ssse3 path of vp9_ipred_vr_4x4_16.
pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
; Byte indices selecting words 0-3 followed by word 3 repeated four times
; (not referenced in the part of the file visible here).
pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
; Constants defined outside this file.
cextern pw_1
cextern pw_1023
cextern pw_4095
cextern pd_16
cextern pd_32
cextern pd_65535;
; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
; only 3 registers on x86-32, which would make it one cycle faster, but that
; would make the code quite a bit uglier...
SECTION .text
; SCRATCH src, dst64, mem32 [, name]
; Park the value of m<src> so the register number can be reused:
;   x86-64: SWAP register numbers <src> and <dst64>
;   x86-32: store m<src> to [<mem32>]
; If a 4th argument is given, reg_<name> is defined as the place the value
; now lives (m<dst64> on x86-64, [<mem32>] on x86-32).
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP        %1, %2
%else
    mova        [%3], m%1
%endif
%if %0 == 4
  %if ARCH_X86_64
    %define reg_%4 m%2
  %else
    %define reg_%4 [%3]
  %endif
%endif
%endmacro
; UNSCRATCH dst, src64, mem32 [, name]
; Counterpart of SCRATCH: bring a parked value back into m<dst>.
;   x86-64: SWAP register numbers <dst> and <src64>
;   x86-32: reload m<dst> from [<mem32>]
; If a 4th argument is given, the reg_<name> alias is removed again.
%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP        %1, %2
%else
    mova        m%1, [%3]
%endif
%if %0 == 4
    %undef reg_%4
%endif
%endmacro
; PRELOAD reg, mem [, name]
; x86-64: load [<mem>] into m<reg>; with a name, reg_<name> aliases m<reg>.
; x86-32: emits no load; with a name, reg_<name> aliases the memory operand
;         [<mem>] itself, so users of reg_<name> read it straight from memory.
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova        m%1, [%2]
%endif
%if %0 == 3
  %if ARCH_X86_64
    %define reg_%3 m%1
  %else
    %define reg_%3 [%2]
  %endif
%endif
%endmacro
; vp9_ipred_v_4x4_16(dst, stride, l, a)
; Vertical predictor: one register worth of data from [a] is replicated into
; four consecutive dst rows. stride is used directly as a byte offset.
INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq]                    ; the row above the block
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    RET
; vp9_ipred_v_8x8_16(dst, stride, l, a)
; Vertical predictor: the register loaded from [a] is written to eight
; consecutive dst rows, in two groups of four.
INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq]                    ; the row above the block
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    ; rows 0-3
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    ; rows 4-7
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    RET
; vp9_ipred_v_16x16_16(dst, stride, l, a)
; Vertical predictor: two registers (2*mmsize bytes) from [a] are replicated
; into 16 dst rows, four rows per loop iteration.
INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq]
    mova        m1, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride_x3, iter
    lea         stride_x3q, [strideq*3]
    mov         iterd, 4                    ; 4 iterations x 4 rows
.store_rows:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m1
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride_x3q+ 0], m0
    mova        [dstq+stride_x3q+16], m1
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jg          .store_rows
    RET
; vp9_ipred_v_32x32_16(dst, stride, l, a)
; Vertical predictor: four registers (4*mmsize bytes) from [a] are replicated
; into 32 dst rows, two rows per loop iteration.
INIT_XMM sse
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
    movifnidn   aq, amp
    mova        m0, [aq+mmsize*0]
    mova        m1, [aq+mmsize*1]
    mova        m2, [aq+mmsize*2]
    mova        m3, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, iter
    mov         iterd, 16                   ; 16 iterations x 2 rows
.store_rows:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m1
    mova        [dstq+strideq*0+32], m2
    mova        [dstq+strideq*0+48], m3
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*1+32], m2
    mova        [dstq+strideq*1+48], m3
    lea         dstq, [dstq+strideq*2]
    dec         iterd
    jg          .store_rows
    RET
; vp9_ipred_h_4x4_16(dst, stride, l, a)
; Horizontal predictor: each dst row is one word of [l] broadcast across the
; row. Word 3 of [l] goes to row 0 and word 0 to row 3.
INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
    mova        m3, [lq]
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    pshufw      m0, m3, q3333               ; row 0
    pshufw      m1, m3, q2222               ; row 1
    pshufw      m2, m3, q1111               ; row 2
    pshufw      m3, m3, q0000               ; row 3
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    mova        [dstq+strideq*2], m2
    mova        [dstq+stride_x3q], m3
    RET
; vp9_ipred_h_8x8_16(dst, stride, l, a)
; Horizontal predictor: each dst row is one word of [l] broadcast across the
; row. The high four words of [l] produce rows 0-3 (word 7 -> row 0), the low
; four words produce rows 4-7.
INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
    mova        m2, [lq]
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    ; rows 0-3: duplicate words 4-7 into dwords, then broadcast each dword
    punpckhwd   m3, m2, m2
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    pshufd      m0, m3, q1111
    pshufd      m1, m3, q0000
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m1
    ; rows 4-7: same again for words 0-3
    lea         dstq, [dstq+strideq*4]
    punpcklwd   m2, m2
    pshufd      m0, m2, q3333
    pshufd      m1, m2, q2222
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    pshufd      m0, m2, q1111
    pshufd      m1, m2, q0000
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m1
    RET
; vp9_ipred_h_16x16_16(dst, stride, l, a)
; Horizontal predictor, 16 rows of 2*mmsize bytes. cnt counts 3..0 and also
; indexes [l] in groups of four words (8 bytes), so the last group of [l] is
; consumed first; within a group the highest word goes to the first row.
INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov         cntd, 3
    lea         stride3q, [strideq*3]
.four_rows:
    movh        m3, [lq+cntq*8]             ; four words of l
    punpcklwd   m3, m3                      ; each word doubled into a dword
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    pshufd      m2, m3, q1111
    pshufd      m3, m3, q0000
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m1
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*2+ 0], m2
    mova        [dstq+strideq*2+16], m2
    mova        [dstq+stride3q + 0], m3
    mova        [dstq+stride3q +16], m3
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jge         .four_rows                  ; runs for cnt = 3, 2, 1, 0
    RET
; vp9_ipred_h_32x32_16(dst, stride, l, a)
; Horizontal predictor, 32 rows of 4*mmsize bytes. cnt counts 7..0 and also
; indexes [l] in groups of four words (8 bytes), so the last group of [l] is
; consumed first; within a group the highest word goes to the first row.
INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov         cntd, 7
    lea         stride3q, [strideq*3]
.four_rows:
    movh        m3, [lq+cntq*8]             ; four words of l
    punpcklwd   m3, m3                      ; each word doubled into a dword
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    pshufd      m2, m3, q1111
    pshufd      m3, m3, q0000
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*0+32], m0
    mova        [dstq+strideq*0+48], m0
    mova        [dstq+strideq*1+ 0], m1
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*1+32], m1
    mova        [dstq+strideq*1+48], m1
    mova        [dstq+strideq*2+ 0], m2
    mova        [dstq+strideq*2+16], m2
    mova        [dstq+strideq*2+32], m2
    mova        [dstq+strideq*2+48], m2
    mova        [dstq+stride3q + 0], m3
    mova        [dstq+stride3q +16], m3
    mova        [dstq+stride3q +32], m3
    mova        [dstq+stride3q +48], m3
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jge         .four_rows                  ; runs for cnt = 7 .. 0
    RET
; vp9_ipred_dc_4x4_16(dst, stride, l, a)
; DC predictor: every output word is (sum of the 4 words at [l] and the
; 4 words at [a] + 4) >> 3.
INIT_MMX mmxext
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [lq]
    paddw       m0, [aq]                    ; word-wise l + a
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    pmaddwd     m0, [pw_1]                  ; adjacent words summed into dwords
    pshufw      m1, m0, q3232               ; high dword moved to the low half
    paddd       m0, [pd_4]                  ; rounding term
    paddd       m0, m1                      ; low dword = total + 4
    psrad       m0, 3
    pshufw      m0, m0, q0000               ; broadcast the low word
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    RET
; vp9_ipred_dc_8x8_16(dst, stride, l, a)
; DC predictor: every output word is (sum of the 8 words at [l] and the
; 8 words at [a] + 8) >> 4.
INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [lq]
    paddw       m0, [aq]                    ; word-wise l + a
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    ; horizontal reduction of the 8 words into the low dword
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_8]                  ; rounding term
    paddd       m0, m1
    psrad       m0, 4
    ; broadcast the low word to the whole register
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    RET
; vp9_ipred_dc_16x16_16(dst, stride, l, a)
; DC predictor: every output word is (sum of the 16 words at [l] and the
; 16 words at [a] + 16) >> 5.
INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
    ; word-wise accumulation of both halves of l and a
    mova        m0, [lq]
    paddw       m0, [lq+mmsize]
    paddw       m0, [aq]
    paddw       m0, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride_x3, iter
    lea         stride_x3q, [strideq*3]
    mov         iterd, 4                    ; 4 iterations x 4 rows
    ; horizontal reduction of the 8 words into the low dword
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_16]                 ; rounding term
    paddd       m0, m1
    psrad       m0, 5
    ; broadcast the low word to the whole register
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.store_rows:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride_x3q+ 0], m0
    mova        [dstq+stride_x3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jg          .store_rows
    RET
; vp9_ipred_dc_32x32_16(dst, stride, l, a)
; DC predictor: every output word is (sum of the 32 words at [l] and the
; 32 words at [a] + 32) >> 6.
INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
    ; word-wise accumulation of all four registers of l and of a
    mova        m0, [lq+mmsize*0]
    paddw       m0, [lq+mmsize*1]
    paddw       m0, [lq+mmsize*2]
    paddw       m0, [lq+mmsize*3]
    paddw       m0, [aq+mmsize*0]
    paddw       m0, [aq+mmsize*1]
    paddw       m0, [aq+mmsize*2]
    paddw       m0, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, stride_x3, iter
    lea         stride_x3q, [strideq*3]
    mov         iterd, 16                   ; 16 iterations x 2 rows
    ; horizontal reduction of the 8 words into the low dword
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_32]                 ; rounding term
    paddd       m0, m1
    psrad       m0, 6
    ; broadcast the low word to the whole register
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.store_rows:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*0+32], m0
    mova        [dstq+strideq*0+48], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*1+32], m0
    mova        [dstq+strideq*1+48], m0
    lea         dstq, [dstq+strideq*2]
    dec         iterd
    jg          .store_rows
    RET
; DC_1D_FNS name, ptr
; Emits vp9_ipred_dc_<name>_{4x4,8x8,16x16,32x32}_16, the one-sided DC
; predictors: the block is filled with the rounded average of the N words
; read from [<ptr>] only (N = 4, 8, 16, 32).
;   %1: function name infix
;   %2: pointer register holding the edge to average
%macro DC_1D_FNS 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2]
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    pmaddwd     m0, [pw_1]                  ; adjacent words summed into dwords
    pshufw      m1, m0, q3232
    paddd       m0, [pd_2]                  ; rounding term
    paddd       m0, m1
    psrad       m0, 2                       ; (sum + 2) >> 2
    pshufw      m0, m0, q0000               ; broadcast the low word
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2]
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_4]                  ; rounding term
    paddd       m0, m1
    psrad       m0, 3                       ; (sum + 4) >> 3
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0                      ; low word broadcast to all words
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2]
    paddw       m0, [%2+mmsize]
    DEFINE_ARGS dst, stride, stride_x3, iter
    lea         stride_x3q, [strideq*3]
    mov         iterd, 4                    ; 4 iterations x 4 rows
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_8]                  ; rounding term
    paddd       m0, m1
    psrad       m0, 4                       ; (sum + 8) >> 4
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.store_rows:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride_x3q+ 0], m0
    mova        [dstq+stride_x3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jg          .store_rows
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
    mova        m0, [%2+mmsize*0]
    paddw       m0, [%2+mmsize*1]
    paddw       m0, [%2+mmsize*2]
    paddw       m0, [%2+mmsize*3]
    DEFINE_ARGS dst, stride, iter
    mov         iterd, 16                   ; 16 iterations x 2 rows
    pmaddwd     m0, [pw_1]
    pshufd      m1, m0, q3232
    paddd       m0, m1
    pshufd      m1, m0, q1111
    paddd       m0, [pd_16]                 ; rounding term
    paddd       m0, m1
    psrad       m0, 5                       ; (sum + 16) >> 5
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.store_rows:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*0+32], m0
    mova        [dstq+strideq*0+48], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*1+32], m0
    mova        [dstq+strideq*1+48], m0
    lea         dstq, [dstq+strideq*2]
    dec         iterd
    jg          .store_rows
    RET
%endmacro

; averages [a] only
DC_1D_FNS top, aq
; averages [l] only
DC_1D_FNS left, lq
; vp9_ipred_tm_4x4_10 / vp9_ipred_tm_4x4_12 (dst, stride, l, a)
; Each output word is clamp(l[row] + a[col] - a[-1], 0, max), where a[-1] is
; the word immediately before [a]. The two entry points differ only in the
; upper clamp (pw_1023 vs pw_4095) preloaded into m5; the _12 variant jumps
; into the shared .body of the _10 variant.
; Word 3 of [l] is used for row 0, word 0 for row 3.
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
    mova        m5, [pw_1023]
.body:
    mova        m4, [aq]
    mova        m3, [lq]
    movd        m0, [aq-4]                  ; words a[-2], a[-1]
    pshufw      m0, m0, q1111               ; broadcast a[-1]
    psubw       m4, m0                      ; m4 = a - a[-1]
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    ; one broadcast left word per row
    pshufw      m0, m3, q3333
    pshufw      m1, m3, q2222
    pshufw      m2, m3, q1111
    pshufw      m3, m3, q0000
    paddw       m0, m4
    paddw       m1, m4
    paddw       m2, m4
    paddw       m3, m4
    ; clamp to [0, m5]
    pxor        m4, m4
    pmaxsw      m0, m4
    pmaxsw      m1, m4
    pmaxsw      m2, m4
    pmaxsw      m3, m4
    pminsw      m0, m5
    pminsw      m1, m5
    pminsw      m2, m5
    pminsw      m3, m5
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    mova        [dstq+strideq*2], m2
    mova        [dstq+stride_x3q], m3
    RET

cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
; vp9_ipred_tm_8x8_10 / vp9_ipred_tm_8x8_12 (dst, stride, l, a)
; Each output word is clamp(l[row] + a[col] - a[-1], 0, max). The entry
; points differ only in the upper clamp preloaded into m4; the _12 variant
; jumps into the shared .body of the _10 variant.
; The loop handles four rows per pass; idx runs 1, 0 and selects the group of
; four left words, so words 4-7 of [l] are used first (word 7 -> row 0).
INIT_XMM sse2
cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
    mova        m4, [pw_1023]
.body:
    pxor        m6, m6                      ; lower clamp (0)
    mova        m5, [aq]
    movd        m0, [aq-4]                  ; words a[-2], a[-1]
    pshuflw     m0, m0, q1111
    punpcklqdq  m0, m0                      ; a[-1] in every word
    psubw       m5, m0                      ; m5 = a - a[-1]
    DEFINE_ARGS dst, stride, l, stride_x3, idx
    lea         stride_x3q, [strideq*3]
    mov         idxd, 1
.four_rows:
    movh        m3, [lq+idxq*8]             ; four left words
    punpcklwd   m3, m3
    pshufd      m0, m3, q3333
    pshufd      m1, m3, q2222
    pshufd      m2, m3, q1111
    pshufd      m3, m3, q0000
    paddw       m0, m5
    paddw       m1, m5
    paddw       m2, m5
    paddw       m3, m5
    pmaxsw      m0, m6
    pmaxsw      m1, m6
    pmaxsw      m2, m6
    pmaxsw      m3, m6
    pminsw      m0, m4
    pminsw      m1, m4
    pminsw      m2, m4
    pminsw      m3, m4
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m1
    mova        [dstq+strideq*2], m2
    mova        [dstq+stride_x3q], m3
    lea         dstq, [dstq+strideq*4]
    dec         idxd
    jge         .four_rows
    RET

cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
    mova        m4, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
; vp9_ipred_tm_16x16_10 / vp9_ipred_tm_16x16_12 (dst, stride, l, a)
; Each output word is clamp(l[row] + a[col] - a[-1], 0, max). The entry
; points differ only in the upper clamp preloaded into m7; the _12 variant
; jumps into the shared .body of the _10 variant.
; The loop handles two rows per pass; idx runs 7..0 and selects a pair of
; left words, the higher word of each pair being used for the first row.
INIT_XMM sse2
cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
    mova        m7, [pw_1023]
.body:
    pxor        m6, m6                      ; lower clamp (0)
    mova        m4, [aq]
    mova        m5, [aq+mmsize]
    movd        m0, [aq-4]                  ; words a[-2], a[-1]
    pshuflw     m0, m0, q1111
    punpcklqdq  m0, m0                      ; a[-1] in every word
    psubw       m4, m0                      ; m4/m5 = a - a[-1]
    psubw       m5, m0
    DEFINE_ARGS dst, stride, l, idx
    mov         idxd, 7
.two_rows:
    movd        m3, [lq+idxq*4]             ; two left words
    punpcklwd   m3, m3
    pshufd      m2, m3, q1111               ; first row's left word
    pshufd      m3, m3, q0000               ; second row's left word
    paddw       m0, m2, m4
    paddw       m2, m5
    paddw       m1, m3, m4
    paddw       m3, m5
    pmaxsw      m0, m6
    pmaxsw      m2, m6
    pmaxsw      m1, m6
    pmaxsw      m3, m6
    pminsw      m0, m7
    pminsw      m2, m7
    pminsw      m1, m7
    pminsw      m3, m7
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m2
    mova        [dstq+strideq*1+ 0], m1
    mova        [dstq+strideq*1+16], m3
    lea         dstq, [dstq+strideq*2]
    dec         idxd
    jge         .two_rows
    RET

cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
    mova        m7, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
; vp9_ipred_tm_32x32_10 / vp9_ipred_tm_32x32_12 (dst, stride, l, a)
; Each output word is clamp(l[row] + a[col] - a[-1], 0, max). The entry
; points differ only in the upper clamp preloaded into m0; the _12 variant
; jumps into the shared .body of the _10 variant.
; The clamp bounds are parked with SCRATCH: m8/m9 on x86-64, two 16-byte
; stack slots on x86-32 (reg_max / reg_min name whichever is in use).
; One row per pass; idx runs 31..0, so the last word of [l] is used for row 0.
INIT_XMM sse2
cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova        m0, [pw_1023]
.body:
    pxor        m1, m1
    SCRATCH     0, 8, rsp+ 0, max           ; upper clamp
    SCRATCH     1, 9, rsp+16, min           ; lower clamp (0)
    mova        m4, [aq+mmsize*0]
    mova        m5, [aq+mmsize*1]
    mova        m6, [aq+mmsize*2]
    mova        m7, [aq+mmsize*3]
    movd        m0, [aq-4]                  ; words a[-2], a[-1]
    pshuflw     m0, m0, q1111
    punpcklqdq  m0, m0                      ; a[-1] in every word
    psubw       m4, m0                      ; m4..m7 = a - a[-1]
    psubw       m5, m0
    psubw       m6, m0
    psubw       m7, m0
    DEFINE_ARGS dst, stride, l, idx
    mov         idxd, 31
.one_row:
    pinsrw      m3, [lq+idxq*2], 0          ; one left word
    punpcklwd   m3, m3
    pshufd      m3, m3, q0000               ; broadcast to every word
    paddw       m0, m3, m4
    paddw       m1, m3, m5
    paddw       m2, m3, m6
    paddw       m3, m7
    pmaxsw      m0, reg_min
    pmaxsw      m1, reg_min
    pmaxsw      m2, reg_min
    pmaxsw      m3, reg_min
    pminsw      m0, reg_max
    pminsw      m1, reg_max
    pminsw      m2, reg_max
    pminsw      m3, reg_max
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m1
    mova        [dstq+strideq*0+32], m2
    mova        [dstq+strideq*0+48], m3
    add         dstq, strideq
    dec         idxd
    jge         .one_row
    RET

cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova        m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
; Directional intra prediction functions
;
; in the functions below, 'abcdefgh' refers to above data (sometimes simply
; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
; top-left data.
; LOWPASS left, center, right   (arguments are register numbers)
; Per word: m<left> = (m<left> + 2*m<center> + m<right> + 2) >> 2,
; evaluated as pavgw((left + right) >> 1, center). The result overwrites
; m<left>; m<center> and m<right> are only read.
%macro LOWPASS 3 ; left [dst], center, right
    paddw       m%1, m%3
    psraw       m%1, 1
    pavgw       m%1, m%2
%endmacro
; SHIFT_RIGHT dst, src [, ssse3_shift_reg]
; abcdefgh (src) -> bcdefghh (dst): drop the first word and duplicate the
; last one. dst and src may be the same register.
; The optional third operand is the pshufb control used on ssse3 and later;
; it defaults to the in-memory table [pb_2to15_14_15] and is ignored on the
; pre-ssse3 path.
%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb      %1, %2, %3                  ; abcdefgh -> bcdefghh
%else
    psrldq      %1, %2, 2                   ; abcdefgh -> bcdefgh.
    pshufhw     %1, %1, q2210               ; bcdefgh. -> bcdefghh
%endif
%endmacro
; SHIFT_RIGHTx2 dst1, dst2, src [, ssse3_shift_reg]
; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2).
; On ssse3 and later dst2 is derived from dst1 with a second pshufb; on the
; pre-ssse3 path both results are derived from src directly.
%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb      %1, %3, %4                  ; abcdefgh -> bcdefghh
    pshufb      %2, %1, %4                  ; bcdefghh -> cdefghhh
%else
    psrldq      %1, %3, 2                   ; abcdefgh -> bcdefgh.
    psrldq      %2, %3, 4                   ; abcdefgh -> cdefgh..
    pshufhw     %1, %1, q2210               ; bcdefgh. -> bcdefghh
    pshufhw     %2, %2, q1110               ; cdefgh.. -> cdefghhh
%endif
%endmacro
%macro DL_FUNCS 0
; vp9_ipred_dl_4x4_16(dst, stride, l, a)
; 4x4 "dl" directional predictor. Only [a] is read (l is never used). The
; above row is low-pass filtered once, and each successive dst row is that
; filtered row advanced by one more word.
cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
movifnidn aq, amp
movu m1, [aq] ; abcdefgh
pshufhw m0, m1, q3310 ; abcdefhh
SHIFT_RIGHT m1, m1 ; bcdefghh
psrldq m2, m1, 2 ; cdefghh.
LOWPASS 0, 1, 2 ; BCDEFGh.
pshufd m1, m0, q3321 ; DEFGh...
; rows 0 and 2 are stored first; after stepping dst down one row the same
; two offsets address rows 1 and 3
movh [dstq+strideq*0], m0
movh [dstq+strideq*2], m1
add dstq, strideq
psrldq m0, 2 ; CDEFGh..
psrldq m1, 2 ; EFGh....
movh [dstq+strideq*0], m0
movh [dstq+strideq*2], m1
RET
; vp9_ipred_dl_8x8_16(dst, stride, l, a)
; 8x8 "dl" directional predictor. Only [a] is read. The filtered above row
; BCDEFGHh is advanced by one word per dst row, padding with the last word.
; Rows are produced in pairs four rows apart (0/4, 1/5, 2/6, 3/7).
cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq] ; abcdefgh
; m4 caches the shuffle table on ssse3+; the pre-ssse3 SHIFT_RIGHT path does
; not use its third operand
%if cpuflag(ssse3)
mova m4, [pb_2to15_14_15]
%endif
SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh
LOWPASS 0, 1, 2 ; BCDEFGHh
shufps m1, m0, m2, q3332 ; FGHhhhhh
shufps m3, m0, m1, q2121 ; DEFGHhhh
DEFINE_ARGS dst, stride, stride5
lea stride5q, [strideq*5]
; rows 0 and 4
mova [dstq+strideq*0], m0
mova [dstq+strideq*4], m1
SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh
pshuflw m1, m1, q3321 ; GHhhhhhh
pshufd m2, m0, q3321 ; EFGHhhhh
; rows 1 and 5
mova [dstq+strideq*1], m0
mova [dstq+stride5q ], m1
lea dstq, [dstq+strideq*2]
pshuflw m1, m1, q3321 ; Hhhhhhhh
; rows 2 and 6 (dst has been advanced by two rows)
mova [dstq+strideq*0], m3
mova [dstq+strideq*4], m1
pshuflw m1, m1, q3321 ; hhhhhhhh
; rows 3 and 7
mova [dstq+strideq*1], m2
mova [dstq+stride5q ], m1
RET
; vp9_ipred_dl_16x16_16(dst, stride, l, a)
; 16x16 "dl" directional predictor. Only [a] is read. m0/m1 hold the filtered
; above row and m2 the broadcast last word. Each loop pass stores row i and
; row i+8, then advances the m0:m1 window by one word.
cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq] ; abcdefgh
mova m3, [aq+mmsize] ; ijklmnop
PALIGNR m1, m3, m0, 2, m4 ; bcdefghi
PALIGNR m2, m3, m0, 4, m4 ; cdefghij
LOWPASS 0, 1, 2 ; BCDEFGHI
; from here on m4 holds the shuffle table on ssse3+ (it was only passed as the
; last PALIGNR operand above)
%if cpuflag(ssse3)
mova m4, [pb_2to15_14_15]
%endif
SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp
LOWPASS 1, 2, 3 ; JKLMNOPp
pshufd m2, m2, q3333 ; pppppppp
DEFINE_ARGS dst, stride, cnt
mov cntd, 8
.loop:
; row i: m0|m1, row i+8: m1|m2
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m1
mova [dstq+strideq*8+ 0], m1
mova [dstq+strideq*8+16], m2
add dstq, strideq
; shift the 16-word window m0:m1 by one word for the next row
%if cpuflag(avx)
vpalignr m0, m1, m0, 2
%else
PALIGNR m3, m1, m0, 2, m4
mova m0, m3
%endif
SHIFT_RIGHT m1, m1, m4
dec cntd
jg .loop
RET
; vp9_ipred_dl_32x32_16(dst, stride, l, a)
; 32x32 "dl" directional predictor. Only [a] is read. m0..m3 hold the filtered
; above row and m4 the broadcast last word. Each loop pass stores rows i,
; i+8, i+16 and i+24, then advances the m0..m3 window by one word.
cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq+mmsize*0] ; abcdefgh
mova m1, [aq+mmsize*1] ; ijklmnop
mova m2, [aq+mmsize*2] ; qrstuvwx
mova m3, [aq+mmsize*3] ; yz012345
PALIGNR m4, m1, m0, 2, m6
PALIGNR m5, m1, m0, 4, m6
LOWPASS 0, 4, 5 ; BCDEFGHI
PALIGNR m4, m2, m1, 2, m6
PALIGNR m5, m2, m1, 4, m6
LOWPASS 1, 4, 5 ; JKLMNOPQ
PALIGNR m4, m3, m2, 2, m6
PALIGNR m5, m3, m2, 4, m6
LOWPASS 2, 4, 5 ; RSTUVWXY
; from here on m6 holds the shuffle table on ssse3+
%if cpuflag(ssse3)
mova m6, [pb_2to15_14_15]
%endif
SHIFT_RIGHTx2 m4, m5, m3, m6
LOWPASS 3, 4, 5 ; Z0123455
pshufd m4, m4, q3333 ; 55555555
DEFINE_ARGS dst, stride, stride8, stride24, cnt
mov cntd, 8
lea stride8q, [strideq*8]
lea stride24q, [stride8q*3]
.loop:
; row i
mova [dstq+stride8q*0+ 0], m0
mova [dstq+stride8q*0+16], m1
mova [dstq+stride8q*0+32], m2
mova [dstq+stride8q*0+48], m3
; row i+8: the same data one register (8 words) further on
mova [dstq+stride8q*1+ 0], m1
mova [dstq+stride8q*1+16], m2
mova [dstq+stride8q*1+32], m3
mova [dstq+stride8q*1+48], m4
; row i+16
mova [dstq+stride8q*2+ 0], m2
mova [dstq+stride8q*2+16], m3
mova [dstq+stride8q*2+32], m4
mova [dstq+stride8q*2+48], m4
; row i+24
mova [dstq+stride24q + 0], m3
mova [dstq+stride24q +16], m4
mova [dstq+stride24q +32], m4
mova [dstq+stride24q +48], m4
add dstq, strideq
; shift the 32-word window m0..m3 by one word for the next row
%if cpuflag(avx)
vpalignr m0, m1, m0, 2
vpalignr m1, m2, m1, 2
vpalignr m2, m3, m2, 2
%else
PALIGNR m5, m1, m0, 2, m6
mova m0, m5
PALIGNR m5, m2, m1, 2, m6
mova m1, m5
PALIGNR m5, m3, m2, 2, m6
mova m2, m5
%endif
SHIFT_RIGHT m3, m3, m6
dec cntd
jg .loop
RET
%endmacro
INIT_XMM sse2
DL_FUNCS
INIT_XMM ssse3
DL_FUNCS
INIT_XMM avx
DL_FUNCS
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; vp9_ipred_dr_4x4_16(dst, stride, l, a)
; 4x4 "dr" directional predictor. Left words, the top-left word and the above
; words are joined into one edge vector and low-pass filtered. The bottom row
; is stored first, and each row above it is the filtered edge advanced by one
; more word.
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
movh m0, [lq] ; wxyz....
movhps m0, [aq-2] ; wxyz*abc
movd m1, [aq+6] ; d.......
PALIGNR m1, m0, 2, m2 ; xyz*abcd
psrldq m2, m1, 2 ; yz*abcd.
LOWPASS 0, 1, 2 ; XYZ#ABC.
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
; rows 3, 2, 1, 0 in that order
movh [dstq+stride3q ], m0
psrldq m0, 2 ; YZ#ABC..
movh [dstq+strideq*2], m0
psrldq m0, 2 ; Z#ABC...
movh [dstq+strideq*1], m0
psrldq m0, 2 ; #ABC....
movh [dstq+strideq*0], m0
RET
; vp9_ipred_dr_8x8_16(dst, stride, l, a)
; 8x8 "dr" directional predictor. m3 holds the filtered above words and m2
; the filtered left words ending in the filtered top-left. Rows are written
; in the order 3/7, 2/6, 1/5, 0/4. Rows 4-7 (via dst4) take a full register.
; Rows 0-3 take the high half of that register followed by the low half of
; m3.
cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
mova m0, [lq] ; stuvwxyz
movu m1, [aq-2] ; *abcdefg
mova m2, [aq] ; abcdefgh
psrldq m3, m2, 2 ; bcdefgh.
LOWPASS 3, 2, 1 ; ABCDEFG.
PALIGNR m1, m0, 2, m4 ; tuvwxyz*
PALIGNR m2, m1, 2, m4 ; uvwxyz*a
LOWPASS 2, 1, 0 ; TUVWXYZ#
DEFINE_ARGS dst, stride, dst4, stride3
lea stride3q, [strideq*3]
lea dst4q, [dstq+strideq*4]
; rows 3 and 7
movhps [dstq +stride3q +0], m2
movh [dstq+ stride3q +8], m3
mova [dst4q+stride3q +0], m2
; advance the edge by one word: next word of m3 enters the left register
PALIGNR m1, m3, m2, 2, m0
psrldq m3, 2
; rows 2 and 6
movhps [dstq +strideq*2+0], m1
movh [dstq+ strideq*2+8], m3
mova [dst4q+strideq*2+0], m1
PALIGNR m2, m3, m1, 2, m0
psrldq m3, 2
; rows 1 and 5
movhps [dstq +strideq*1+0], m2
movh [dstq+ strideq*1+8], m3
mova [dst4q+strideq*1+0], m2
PALIGNR m1, m3, m2, 2, m0
psrldq m3, 2
; rows 0 and 4
movhps [dstq +strideq*0+0], m1
movh [dstq+ strideq*0+8], m3
mova [dst4q+strideq*0+0], m1
RET
; vp9_ipred_dr_16x16_16(dst, stride, l, a)
; 16x16 "dr" directional predictor. After filtering, the chain m2, m4, m5, m6
; holds the filtered edge from left (LMNOPQRS, TUVWXYZ#) to above (ABCDEFGH,
; IJKLMNO.). Each loop pass writes row 7-i (m4|m5) and row 15-i (m2|m4), then
; advances the whole chain by one word.
cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
mova m0, [lq] ; klmnopqr
mova m1, [lq+mmsize] ; stuvwxyz
movu m2, [aq-2] ; *abcdefg
movu m3, [aq+mmsize-2] ; hijklmno
mova m4, [aq] ; abcdefgh
mova m5, [aq+mmsize] ; ijklmnop
psrldq m6, m5, 2 ; jklmnop.
LOWPASS 6, 5, 3 ; IJKLMNO.
PALIGNR m5, m4, 2, m3 ; bcdefghi
LOWPASS 5, 4, 2 ; ABCDEFGH
PALIGNR m2, m1, 2, m3 ; tuvwxyz*
PALIGNR m4, m2, 2, m3 ; uvwxyz*a
LOWPASS 4, 2, 1 ; TUVWXYZ#
PALIGNR m1, m0, 2, m3 ; lmnopqrs
PALIGNR m2, m1, 2, m3 ; mnopqrst
LOWPASS 2, 1, 0 ; LMNOPQRS
DEFINE_ARGS dst, stride, dst8, cnt
lea dst8q, [dstq+strideq*8]
mov cntd, 8
.loop:
; dst8 walks upwards from row 8: it is decremented before each store
sub dst8q, strideq
mova [dst8q+strideq*0+ 0], m4
mova [dst8q+strideq*0+16], m5
mova [dst8q+strideq*8+ 0], m2
mova [dst8q+strideq*8+16], m4
; advance the chain m2 <- m4 <- m5 <- m6 by one word
%if cpuflag(avx)
vpalignr m2, m4, m2, 2
vpalignr m4, m5, m4, 2
vpalignr m5, m6, m5, 2
%else
PALIGNR m0, m4, m2, 2, m1
mova m2, m0
PALIGNR m0, m5, m4, 2, m1
mova m4, m0
PALIGNR m0, m6, m5, 2, m1
mova m5, m0
%endif
psrldq m6, 2
dec cntd
jg .loop
RET
; vp9_ipred_dr_32x32_16(dst, stride, l, a)
; 32x32 "dr" directional predictor. The filtered edge needs eight registers
; (four for the left words, four for the above words) plus temporaries, so
; some values are parked with SCRATCH/UNSCRATCH: extra registers on x86-64,
; stack slots on x86-32. %1 (the macro argument) is the number of mmsize-byte
; stack slots requested on x86-32.
; Each loop pass writes rows 7-i, 15-i, 23-i and 31-i, then advances the whole
; register chain by one word.
cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
%1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
; filter the above row, last register first
mova m0, [aq+mmsize*3] ; a[24-31]
movu m1, [aq+mmsize*3-2] ; a[23-30]
psrldq m2, m0, 2 ; a[25-31].
LOWPASS 2, 0, 1 ; A[24-30].
mova m1, [aq+mmsize*2] ; a[16-23]
movu m3, [aq+mmsize*2-2] ; a[15-22]
PALIGNR m0, m1, 2, m4 ; a[17-24]
LOWPASS 0, 1, 3 ; A[16-23]
mova m3, [aq+mmsize*1] ; a[8-15]
movu m4, [aq+mmsize*1-2] ; a[7-14]
PALIGNR m1, m3, 2, m5 ; a[9-16]
LOWPASS 1, 3, 4 ; A[8-15]
mova m4, [aq+mmsize*0] ; a[0-7]
movu m5, [aq+mmsize*0-2] ; *a[0-6]
PALIGNR m3, m4, 2, m6 ; a[1-8]
LOWPASS 3, 4, 5 ; A[0-7]
; park A[8-15] and A[0-7]; without ssse3 also park A[16-23], because m0 is
; passed as the last PALIGNR operand below
SCRATCH 1, 8, rsp+0*mmsize
SCRATCH 3, 9, rsp+1*mmsize
%if notcpuflag(ssse3)
SCRATCH 0, 10, rsp+2*mmsize
%endif
; filter the left column; m5/m4 still hold *a[0-6] / a[0-7] from above
mova m6, [lq+mmsize*3] ; l[24-31]
PALIGNR m5, m6, 2, m0 ; l[25-31]*
PALIGNR m4, m5, 2, m0 ; l[26-31]*a
LOWPASS 4, 5, 6 ; L[25-31]#
mova m7, [lq+mmsize*2] ; l[16-23]
PALIGNR m6, m7, 2, m0 ; l[17-24]
PALIGNR m5, m6, 2, m0 ; l[18-25]
LOWPASS 5, 6, 7 ; L[17-24]
mova m1, [lq+mmsize*1] ; l[8-15]
PALIGNR m7, m1, 2, m0 ; l[9-16]
PALIGNR m6, m7, 2, m0 ; l[10-17]
LOWPASS 6, 7, 1 ; L[9-16]
mova m3, [lq+mmsize*0] ; l[0-7]
PALIGNR m1, m3, 2, m0 ; l[1-8]
PALIGNR m7, m1, 2, m0 ; l[2-9]
LOWPASS 7, 1, 3 ; L[1-8]
; restore what can stay resident for this instruction set; the remaining
; values are restored at the top of each loop pass
%if cpuflag(ssse3)
%if cpuflag(avx)
UNSCRATCH 1, 8, rsp+0*mmsize
%endif
UNSCRATCH 3, 9, rsp+1*mmsize
%else
UNSCRATCH 0, 10, rsp+2*mmsize
%endif
DEFINE_ARGS dst8, stride, stride8, stride24, cnt
lea stride8q, [strideq*8]
lea stride24q, [stride8q*3]
lea dst8q, [dst8q+strideq*8]
mov cntd, 8
.loop:
; dst8 walks upwards from row 8: it is decremented before each store
sub dst8q, strideq
%if notcpuflag(avx)
UNSCRATCH 1, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
UNSCRATCH 3, 9, rsp+1*mmsize
%endif
%endif
; four rows, eight rows apart; each uses the chain one register further
; towards the left-column end (m0, m1, m3, m4, m5, m6, m7)
mova [dst8q+stride8q*0+ 0], m4
mova [dst8q+stride8q*0+16], m3
mova [dst8q+stride8q*0+32], m1
mova [dst8q+stride8q*0+48], m0
mova [dst8q+stride8q*1+ 0], m5
mova [dst8q+stride8q*1+16], m4
mova [dst8q+stride8q*1+32], m3
mova [dst8q+stride8q*1+48], m1
mova [dst8q+stride8q*2+ 0], m6
mova [dst8q+stride8q*2+16], m5
mova [dst8q+stride8q*2+32], m4
mova [dst8q+stride8q*2+48], m3
mova [dst8q+stride24q + 0], m7
mova [dst8q+stride24q +16], m6
mova [dst8q+stride24q +32], m5
mova [dst8q+stride24q +48], m4
; advance the chain m7 <- m6 <- m5 <- m4 <- m3 <- m1 <- m0 <- m2 by one word
%if cpuflag(avx)
vpalignr m7, m6, m7, 2
vpalignr m6, m5, m6, 2
vpalignr m5, m4, m5, 2
vpalignr m4, m3, m4, 2
vpalignr m3, m1, m3, 2
vpalignr m1, m0, m1, 2
vpalignr m0, m2, m0, 2
%else
; non-avx: m2 is needed as the PALIGNR destination (and, without ssse3, m0 as
; its last operand), so their contents are parked while the chain is shifted
SCRATCH 2, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
SCRATCH 0, 9, rsp+1*mmsize
%endif
PALIGNR m2, m6, m7, 2, m0
mova m7, m2
PALIGNR m2, m5, m6, 2, m0
mova m6, m2
PALIGNR m2, m4, m5, 2, m0
mova m5, m2
PALIGNR m2, m3, m4, 2, m0
mova m4, m2
PALIGNR m2, m1, m3, 2, m0
mova m3, m2
%if notcpuflag(ssse3)
UNSCRATCH 0, 9, rsp+1*mmsize
SCRATCH 3, 9, rsp+1*mmsize
%endif
PALIGNR m2, m0, m1, 2, m3
mova m1, m2
UNSCRATCH 2, 8, rsp+0*mmsize
SCRATCH 1, 8, rsp+0*mmsize
PALIGNR m1, m2, m0, 2, m3
mova m0, m1
%endif
psrldq m2, 2
dec cntd
jg .loop
RET
%endmacro
INIT_XMM sse2
DR_FUNCS 3
INIT_XMM ssse3
DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; vp9_ipred_vl_4x4_16(dst, stride, l, a)
; 4x4 "vl" directional predictor. Only [a] is read. Even rows use the
; pairwise average of neighbouring above words (pavgw); odd rows use the
; 3-tap LOWPASS result. Rows 2 and 3 are rows 0 and 1 advanced by one word.
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
movifnidn aq, amp
movu m0, [aq] ; abcdefgh
psrldq m1, m0, 2 ; bcdefgh.
psrldq m2, m0, 4 ; cdefgh..
LOWPASS 2, 1, 0 ; BCDEFGH.
pavgw m1, m0 ; ABCDEFG.
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
movh [dstq+strideq*0], m1
movh [dstq+strideq*1], m2
psrldq m1, 2
psrldq m2, 2
movh [dstq+strideq*2], m1
movh [dstq+stride3q ], m2
RET
; vp9_ipred_vl_8x8_16(dst, stride, l, a)
; 8x8 "vl" directional predictor. Only [a] is read. m1 holds the pairwise
; averages (even rows) and m2 the 3-tap filtered words (odd rows). After each
; pair of rows both are advanced by one word, repeating the last word.
cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq] ; abcdefgh
; m3 caches the shuffle table on ssse3+; the pre-ssse3 SHIFT_RIGHT path does
; not use its third operand
%if cpuflag(ssse3)
mova m3, [pb_2to15_14_15]
%endif
SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh
LOWPASS 2, 1, 0 ; BCDEFGHh
pavgw m1, m0 ; ABCDEFGh
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
; rows 0-1
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
SHIFT_RIGHT m1, m1, m3
SHIFT_RIGHT m2, m2, m3
; rows 2-3
mova [dstq+strideq*2], m1
mova [dstq+stride3q ], m2
lea dstq, [dstq+strideq*4]
SHIFT_RIGHT m1, m1, m3
SHIFT_RIGHT m2, m2, m3
; rows 4-5
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
SHIFT_RIGHT m1, m1, m3
SHIFT_RIGHT m2, m2, m3
; rows 6-7
mova [dstq+strideq*2], m1
mova [dstq+stride3q ], m2
RET
; vp9_ipred_vl_16x16_16(dst, stride, l, a)
; 16x16 "vl" directional predictor. Only [a] is read. m2|m1 hold the pairwise
; averages (even rows) and m3|m0 the 3-tap filtered words (odd rows). Each
; loop pass stores two rows and advances both pairs by one word.
cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq]
mova m1, [aq+mmsize]
; first 8 columns: neighbours come from the next register
PALIGNR m2, m1, m0, 2, m3
PALIGNR m3, m1, m0, 4, m4
LOWPASS 3, 2, 0
pavgw m2, m0
; from here on m4 holds the shuffle table on ssse3+
%if cpuflag(ssse3)
mova m4, [pb_2to15_14_15]
%endif
; last 8 columns: neighbours are padded with the last word
SHIFT_RIGHTx2 m5, m0, m1, m4
LOWPASS 0, 5, 1
pavgw m1, m5
DEFINE_ARGS dst, stride, cnt
mov cntd, 8
.loop:
mova [dstq+strideq*0+ 0], m2
mova [dstq+strideq*0+16], m1
mova [dstq+strideq*1+ 0], m3
mova [dstq+strideq*1+16], m0
lea dstq, [dstq+strideq*2]
; advance both 16-word rows by one word
%if cpuflag(avx)
vpalignr m2, m1, m2, 2
vpalignr m3, m0, m3, 2
%else
PALIGNR m5, m1, m2, 2, m4
mova m2, m5
PALIGNR m5, m0, m3, 2, m4
mova m3, m5
%endif
SHIFT_RIGHT m1, m1, m4
SHIFT_RIGHT m0, m0, m4
dec cntd
jg .loop
RET
; vp9_ipred_vl_32x32_16(dst, stride, l, a)
; 32x32 "vl" directional predictor. Only [a] is read. m6|m4|m2|m0 hold the
; pairwise averages (even rows) and m7|m5|m3|m1 the 3-tap filtered words (odd
; rows). Each loop pass stores rows 2i, 2i+1, 2i+16 and 2i+17, then advances
; both rows by one word. %1 (the macro argument) is the number of mmsize-byte
; stack slots requested on x86-32.
; On x86-64 m9 holds the broadcast last word for the right-hand column of rows
; 16-31; on x86-32 that column is filled in after the loop instead.
cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq+mmsize*0]
mova m1, [aq+mmsize*1]
mova m2, [aq+mmsize*2]
; columns 0-7
PALIGNR m6, m1, m0, 2, m5
PALIGNR m7, m1, m0, 4, m5
LOWPASS 7, 6, 0
pavgw m6, m0
; park the averaged columns 0-7 so that m6 can be reused below
SCRATCH 6, 8, rsp+0*mmsize
; columns 8-15
PALIGNR m4, m2, m1, 2, m0
PALIGNR m5, m2, m1, 4, m0
LOWPASS 5, 4, 1
pavgw m4, m1
; columns 16-23
mova m0, [aq+mmsize*3]
PALIGNR m1, m0, m2, 2, m6
PALIGNR m3, m0, m2, 4, m6
LOWPASS 3, 1, 2
pavgw m2, m1
; reg_shuf is only defined on ssse3+; the pre-ssse3 SHIFT_RIGHT path does not
; use its shuffle operand
%if cpuflag(ssse3)
PRELOAD 10, pb_2to15_14_15, shuf
%endif
; columns 24-31: neighbours are padded with the last word
SHIFT_RIGHTx2 m6, m1, m0, reg_shuf
LOWPASS 1, 6, 0
pavgw m0, m6
%if ARCH_X86_64
; the top dword of m6 is the last word twice, so this broadcasts it
pshufd m9, m6, q3333
%endif
%if cpuflag(avx)
UNSCRATCH 6, 8, rsp+0*mmsize
%endif
DEFINE_ARGS dst, stride, cnt, stride16, stride17
mov stride16q, strideq
mov cntd, 8
shl stride16q, 4
lea stride17q, [stride16q+strideq]
; FIXME m8 is unused for avx, so we could save one register here for win64
.loop:
%if notcpuflag(avx)
UNSCRATCH 6, 8, rsp+0*mmsize
%endif
; rows 2i and 2i+1
mova [dstq+strideq*0+ 0], m6
mova [dstq+strideq*0+16], m4
mova [dstq+strideq*0+32], m2
mova [dstq+strideq*0+48], m0
mova [dstq+strideq*1+ 0], m7
mova [dstq+strideq*1+16], m5
mova [dstq+strideq*1+32], m3
mova [dstq+strideq*1+48], m1
; rows 2i+16 and 2i+17: the same data one register (8 words) further on
mova [dstq+stride16q+ 0], m4
mova [dstq+stride16q+16], m2
mova [dstq+stride16q+32], m0
%if ARCH_X86_64
mova [dstq+stride16q+48], m9
%endif
mova [dstq+stride17q+ 0], m5
mova [dstq+stride17q+16], m3
mova [dstq+stride17q+32], m1
%if ARCH_X86_64
mova [dstq+stride17q+48], m9
%endif
lea dstq, [dstq+strideq*2]
; advance both 32-word rows by one word
%if cpuflag(avx)
vpalignr m6, m4, m6, 2
vpalignr m4, m2, m4, 2
vpalignr m2, m0, m2, 2
vpalignr m7, m5, m7, 2
vpalignr m5, m3, m5, 2
vpalignr m3, m1, m3, 2
%else
; non-avx: m3 (and, without ssse3, m1) are parked while they serve as the
; PALIGNR destination / last operand
SCRATCH 3, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
SCRATCH 1, 10, rsp+1*mmsize
%endif
PALIGNR m3, m4, m6, 2, m1
mova m6, m3
PALIGNR m3, m2, m4, 2, m1
mova m4, m3
PALIGNR m3, m0, m2, 2, m1
mova m2, m3
PALIGNR m3, m5, m7, 2, m1
mova m7, m3
UNSCRATCH 3, 8, rsp+0*mmsize
SCRATCH 6, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
UNSCRATCH 1, 10, rsp+1*mmsize
SCRATCH 7, 10, rsp+1*mmsize
%endif
PALIGNR m6, m3, m5, 2, m7
mova m5, m6
PALIGNR m6, m1, m3, 2, m7
mova m3, m6
%if notcpuflag(ssse3)
UNSCRATCH 7, 10, rsp+1*mmsize
%endif
%endif
SHIFT_RIGHT m1, m1, reg_shuf
SHIFT_RIGHT m0, m0, reg_shuf
dec cntd
jg .loop
%if ARCH_X86_32
; x86-32 only: the loop skipped the +48 column of rows 16-31. dst has advanced
; 16 rows (8 passes x 2 rows), so it now points at row 16; fill that column
; for 16 rows from m0.
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
%assign %%n 0
%rep 4
mova [dstq+strideq*0+48], m0
mova [dstq+strideq*1+48], m0
mova [dstq+strideq*2+48], m0
mova [dstq+stride3q +48], m0
%if %%n < 3
lea dstq, [dstq+strideq*4]
%endif
%assign %%n (%%n+1)
%endrep
%endif
RET
%endmacro
INIT_XMM sse2
VL_FUNCS 2
INIT_XMM ssse3
VL_FUNCS 1
INIT_XMM avx
VL_FUNCS 1
%macro VR_FUNCS 0
; vp9_ipred_vr_4x4_16(dst, stride, l, a)
; 4x4 "vr" directional predictor. Left, top-left and above words are joined
; into one edge vector (xyz*abcd). Row 0 is the pairwise average and row 1
; the 3-tap LOWPASS result, both taken from the high half. Rows 2 and 3 are
; assembled from the filtered left words followed by the start of rows 0/1.
cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
movu m0, [aq-2]
; only the high half of m1 is loaded; PALIGNR by 10 bytes below consumes
; just that half
movhps m1, [lq]
PALIGNR m0, m1, 10, m2 ; xyz*abcd
pslldq m1, m0, 2 ; .xyz*abc
pslldq m2, m0, 4 ; ..xyz*ab
LOWPASS 2, 1, 0 ; ..YZ#ABC
pavgw m1, m0 ; ....#ABC
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
movhps [dstq+strideq*0], m1
movhps [dstq+strideq*1], m2
shufps m0, m2, m1, q3210
%if cpuflag(ssse3)
pshufb m2, [pb_4_5_8to13_8x0]
%else
pshuflw m2, m2, q2222
psrldq m2, 6
%endif
psrldq m0, 6
movh [dstq+strideq*2], m0
movh [dstq+stride3q ], m2
RET
; vp9_ipred_vr_8x8_16(dst, stride, l, a)
; 8x8 "vr" directional predictor. m0 (pairwise average) is row 0 and m3
; (3-tap LOWPASS) is row 1. m4 holds the filtered left words. For every later
; row the register written two rows earlier is shifted up by one word and the
; top word of m4 is shifted in at the bottom, alternating between m0 and m3.
cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
movu m1, [aq-2] ; *abcdefg
movu m2, [lq] ; stuvwxyz
mova m0, [aq] ; abcdefgh
PALIGNR m3, m1, m2, 14, m4 ; z*abcdef
LOWPASS 3, 1, 0
pavgw m0, m1
PALIGNR m1, m2, 2, m4 ; tuvwxyz*
pslldq m4, m2, 2 ; .stuvwxy
LOWPASS 4, 2, 1
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
; rows 0-1
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m3
; each PALIGNR takes the top word of m4; pslldq then exposes the next one
PALIGNR m0, m4, 14, m1
pslldq m4, 2
PALIGNR m3, m4, 14, m1
pslldq m4, 2
; rows 2-3
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
PALIGNR m0, m4, 14, m1
pslldq m4, 2
PALIGNR m3, m4, 14, m1
pslldq m4, 2
; rows 4-5
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m3
PALIGNR m0, m4, 14, m1
pslldq m4, 2
; m4 is not needed afterwards, so it is also passed as the last operand
PALIGNR m3, m4, 14, m4
; rows 6-7
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m3
RET
; vp9_ipred_vr_16x16_16(dst, stride, l, a)
; 16x16 "vr" directional predictor. m3|m2 (pairwise averages) form the even
; rows and m0|m6 (3-tap LOWPASS) the odd rows. The filtered left words are
; split into their odd-indexed words (m7) and even-indexed words (m5). Each
; loop pass stores two rows, then shifts both rows up by one word and feeds
; in the top word of m7 (even row) and of m5 (odd row).
cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
movu m1, [aq-2] ; *abcdefg
movu m2, [aq+mmsize-2] ; hijklmno
mova m3, [aq] ; abcdefgh
mova m4, [aq+mmsize] ; ijklmnop
mova m5, [lq+mmsize] ; stuvwxyz
PALIGNR m0, m1, m5, 14, m6 ; z*abcdef
movu m6, [aq+mmsize-4] ; ghijklmn
LOWPASS 6, 2, 4
pavgw m2, m4
LOWPASS 0, 1, 3
pavgw m3, m1
; filter the left column
PALIGNR m1, m5, 2, m7 ; tuvwxyz*
movu m7, [lq+mmsize-2] ; rstuvwxy
LOWPASS 1, 5, 7
movu m5, [lq+2] ; lmnopqrs
pslldq m4, m5, 2 ; .lmnopqr
pslldq m7, m5, 4 ; ..lmnopq
LOWPASS 5, 4, 7
; de-interleave the filtered left words: psrld 16 keeps the odd-indexed word
; of each dword and pand pd_65535 the even-indexed one; packssdw packs them
; back into words (m7 = odd words of m5 then m1, m5 = even words of m5 then
; m1)
psrld m4, m1, 16
psrld m7, m5, 16
pand m1, [pd_65535]
pand m5, [pd_65535]
packssdw m7, m4
packssdw m5, m1
DEFINE_ARGS dst, stride, cnt
mov cntd, 8
.loop:
mova [dstq+strideq*0+ 0], m3
mova [dstq+strideq*0+16], m2
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+16], m6
lea dstq, [dstq+strideq*2]
; even row m3|m2: shift up by one word, feeding in the top word of m7
PALIGNR m2, m3, 14, m4
PALIGNR m3, m7, 14, m4
pslldq m7, 2
; odd row m0|m6: shift up by one word, feeding in the top word of m5
PALIGNR m6, m0, 14, m4
PALIGNR m0, m5, 14, m4
pslldq m5, 2
dec cntd
jg .loop
RET
; vp9_ipred_vr_32x32_16(dst, stride, l, a)
; 32x32 variant of the predictor: each row is 64 bytes (four xmm stores).
; On x86-64 a single 8-pass loop stores rows n, n+1, n+16 and n+17 per pass.
; On x86-32 only eight xmm registers exist, so the first loop stores rows n and
; n+16 (the even rows) and a second loop, after reloading dst, stores the odd
; rows. The stack area (6 * mmsize bytes, x86-32 only) holds the vectors that
; SCRATCH parks for that second loop.
; NOTE(review): SCRATCH/UNSCRATCH, LOWPASS and PALIGNR are defined outside this
; chunk. From their use here SCRATCH appears to park a register in a high xmm
; register on x86-64 or in the given stack slot on x86-32 - confirm.
cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
movu m0, [aq+mmsize*0-2] ; *a[0-6]
movu m1, [aq+mmsize*1-2] ; a[7-14]
movu m2, [aq+mmsize*2-2] ; a[15-22]
movu m3, [aq+mmsize*3-2] ; a[23-30]
; For each quarter of the above row: the LOWPASS result is parked (slots 0-3 /
; m8-m11) for the odd rows, the pavgw result stays in m1-m4 for the even rows.
mova m4, [aq+mmsize*3+0] ; a[24-31]
movu m5, [aq+mmsize*3-4] ; a[22-29]
LOWPASS 5, 3, 4 ; A[23-30]
SCRATCH 5, 8, rsp+0*mmsize
pavgw m3, m4
mova m4, [aq+mmsize*2+0] ; a[16-23]
movu m6, [aq+mmsize*2-4] ; a[14-21]
LOWPASS 6, 2, 4 ; A[15-22]
SCRATCH 6, 9, rsp+1*mmsize
pavgw m2, m4
mova m4, [aq+mmsize*1+0] ; a[8-15]
movu m7, [aq+mmsize*1-4] ; a[6-13]
LOWPASS 7, 1, 4 ; A[7-14]
SCRATCH 7, 10, rsp+2*mmsize
pavgw m1, m4
mova m4, [aq+mmsize*0+0] ; a[0-7]
mova m5, [lq+mmsize*3+0] ; l[24-31]
PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5]
LOWPASS 6, 0, 4 ; #A[0-6]
SCRATCH 6, 11, rsp+3*mmsize
pavgw m4, m0
; Filter the upper half of the left column (l[16-31]) and split it into
; odd-indexed words (m6) and even-indexed words (m5); both are parked.
; pd_65535 is declared elsewhere, presumably 0xFFFF per dword.
PALIGNR m0, m5, 2, m7 ; l[25-31]*
movu m7, [lq+mmsize*3-2] ; l[23-30]
LOWPASS 0, 5, 7 ; L[24-31]
movu m5, [lq+mmsize*2-2] ; l[15-22]
mova m7, [lq+mmsize*2+0] ; l[16-23]
movu m6, [lq+mmsize*2+2] ; l[17-24]
LOWPASS 5, 7, 6 ; L[16-23]
psrld m7, m0, 16
psrld m6, m5, 16
pand m0, [pd_65535]
pand m5, [pd_65535]
packssdw m6, m7
packssdw m5, m0
SCRATCH 5, 12, rsp+4*mmsize
SCRATCH 6, 13, rsp+5*mmsize
; Same treatment for the lower half of the left column (l[1-15]): m7 receives
; the odd-indexed words, m0 the even-indexed words.
movu m6, [lq+mmsize*1-2] ; l[7-14]
mova m0, [lq+mmsize*1+0] ; l[8-15]
movu m5, [lq+mmsize*1+2] ; l[9-16]
LOWPASS 6, 0, 5 ; L[8-15]
movu m0, [lq+mmsize*0+2] ; l[1-8]
pslldq m5, m0, 2 ; .l[1-7]
pslldq m7, m0, 4 ; ..l[1-6]
LOWPASS 0, 5, 7
psrld m5, m6, 16
psrld m7, m0, 16
pand m6, [pd_65535]
pand m0, [pd_65535]
packssdw m7, m5
packssdw m0, m6
; Bring the odd-indexed words of the upper left half back into m6; it feeds
; the even rows directly below the first 16.
UNSCRATCH 6, 13, rsp+5*mmsize
DEFINE_ARGS dst, stride, stride16, cnt, stride17
; stride16 = 16 * stride; stride17 = 17 * stride is only needed (and only
; computed) on x86-64.
mov stride16q, strideq
mov cntd, 8
shl stride16q, 4
%if ARCH_X86_64
lea stride17q, [stride16q+strideq]
%endif
.loop:
; Row n
mova [dstq+strideq*0+ 0], m4
mova [dstq+strideq*0+16], m1
mova [dstq+strideq*0+32], m2
mova [dstq+strideq*0+48], m3
%if ARCH_X86_64
; Row n+1
mova [dstq+strideq*1+ 0], m11
mova [dstq+strideq*1+16], m10
mova [dstq+strideq*1+32], m9
mova [dstq+strideq*1+48], m8
%endif
; Row n+16: the same sequence as row n, one vector further along.
mova [dstq+stride16q+ 0], m6
mova [dstq+stride16q+16], m4
mova [dstq+stride16q+32], m1
mova [dstq+stride16q+48], m2
%if ARCH_X86_64
; Row n+17
mova [dstq+stride17q+ 0], m12
mova [dstq+stride17q+16], m11
mova [dstq+stride17q+32], m10
mova [dstq+stride17q+48], m9
%endif
lea dstq, [dstq+strideq*2]
; Slide the even-row chain m3<-m2<-m1<-m4<-m6<-m7 by one sample (m5 is the
; PALIGNR temporary).
PALIGNR m3, m2, 14, m5
PALIGNR m2, m1, 14, m5
PALIGNR m1, m4, 14, m5
PALIGNR m4, m6, 14, m5
PALIGNR m6, m7, 14, m5
pslldq m7, 2
%if ARCH_X86_64
; Slide the odd-row chain m8<-m9<-m10<-m11<-m12<-m0 the same way.
PALIGNR m8, m9, 14, m5
PALIGNR m9, m10, 14, m5
PALIGNR m10, m11, 14, m5
PALIGNR m11, m12, 14, m5
PALIGNR m12, m0, 14, m5
pslldq m0, 2
%endif
dec cntd
jg .loop
%if ARCH_X86_32
; Second pass for the odd rows: reload the parked vectors into m1-m5 (m0 was
; left untouched by the first loop on x86-32), restart from the dst argument
; and step down one row.
UNSCRATCH 5, 12, rsp+4*mmsize
UNSCRATCH 4, 11, rsp+3*mmsize
UNSCRATCH 3, 10, rsp+2*mmsize
UNSCRATCH 2, 9, rsp+1*mmsize
UNSCRATCH 1, 8, rsp+0*mmsize
; NOTE(review): dstm is presumably the x86inc name for the original dst
; argument's memory location - confirm.
mov dstq, dstm
mov cntd, 8
add dstq, strideq
.loop2:
mova [dstq+strideq*0+ 0], m4
mova [dstq+strideq*0+16], m3
mova [dstq+strideq*0+32], m2
mova [dstq+strideq*0+48], m1
mova [dstq+stride16q+ 0], m5
mova [dstq+stride16q+16], m4
mova [dstq+stride16q+32], m3
mova [dstq+stride16q+48], m2
lea dstq, [dstq+strideq*2]
; Slide the chain m1<-m2<-m3<-m4<-m5<-m0 by one sample (m6 is the temporary).
PALIGNR m1, m2, 14, m6
PALIGNR m2, m3, 14, m6
PALIGNR m3, m4, 14, m6
PALIGNR m4, m5, 14, m6
PALIGNR m5, m0, 14, m6
pslldq m0, 2
dec cntd
jg .loop2
%endif
RET
%endmacro
; Instantiate the VR_FUNCS macro once per instruction set; each INIT_XMM
; selects the cpuflags the following expansion is assembled for.
INIT_XMM sse2
VR_FUNCS
INIT_XMM ssse3
VR_FUNCS
INIT_XMM avx
VR_FUNCS
%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; vp9_ipred_hu_4x4_16(dst, stride, l, a)
; Fills a 4x4 block of 16-bit samples at dst using only the four samples at lq;
; the 'a' argument is named but never read. Lane comments use lower case for
; averaged (pavgw) samples and upper case for LOWPASS-filtered samples.
; NOTE(review): LOWPASS, SBUTTERFLY and PALIGNR are defined outside this chunk.
cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
movh m0, [lq] ; abcd
; Replicate the last sample 'd' through the upper half of the register.
%if cpuflag(ssse3)
pshufb m0, [pb_0to7_67x4] ; abcddddd
%else
punpcklqdq m0, m0
pshufhw m0, m0, q3333 ; abcddddd
%endif
psrldq m1, m0, 2 ; bcddddd.
psrldq m2, m0, 4 ; cddddd..
LOWPASS 2, 1, 0 ; BCDddd..
pavgw m1, m0 ; abcddddd
; Interleave averaged and filtered samples word by word.
SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd
PALIGNR m2, m1, 4, m0 ; bCcDdddd
; The l register is no longer needed and is reused for 3 * stride.
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
; Each row is 8 bytes: low/high halves of m1 give rows 0 and 2, low/high
; halves of m2 give rows 1 and 3.
movh [dstq+strideq*0], m1 ; aBbC
movh [dstq+strideq*1], m2 ; bCcD
movhps [dstq+strideq*2], m1 ; cDdd
movhps [dstq+stride3q ], m2 ; dddd
RET
; vp9_ipred_hu_8x8_16(dst, stride, l, a)
; Fills an 8x8 block of 16-bit samples at dst using only the eight samples at
; lq; 'a' is named but never read. The four even rows are stored first, then
; dst is advanced by one row and the four odd rows are stored.
; NOTE(review): SHIFT_RIGHTx2, LOWPASS, SBUTTERFLY and PALIGNR are defined
; outside this chunk; SHIFT_RIGHTx2 presumably yields the input shifted by one
; and by two samples with the last sample replicated - confirm.
cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
mova m0, [lq]
%if cpuflag(ssse3)
; Shuffle mask handed to SHIFT_RIGHTx2 as its fourth operand on ssse3+.
mova m3, [pb_2to15_14_15]
%endif
SHIFT_RIGHTx2 m1, m2, m0, m3
LOWPASS 2, 1, 0
pavgw m1, m0
; Interleave averaged (m1) and filtered (m2) samples; afterwards m1:m2 is one
; 16-sample sequence.
SBUTTERFLY wd, 1, 2, 0
; m0 and m3 are built from m1/m2 so that rows 0, 2, 4 and 6 can be stored
; without further shifting. NOTE(review): assumes q1032 / q3332 encode the
; dword selectors their names spell out.
shufps m0, m1, m2, q1032
pshufd m3, m2, q3332
; The l register is no longer needed and is reused for 3 * stride.
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
; Rows 0, 2, 4, 6
mova [dstq+strideq *0], m1
mova [dstq+strideq *2], m0
mova [dstq+strideq *4], m2
mova [dstq+stride3q*2], m3
add dstq, strideq
; Advance the m1:m2 sequence by 4 bytes (two words) for the odd rows.
%if cpuflag(avx)
vpalignr m1, m2, m1, 4
%else
PALIGNR m0, m2, m1, 4, m3
mova m1, m0
%endif
pshufd m2, m2, q3321
shufps m0, m1, m2, q1032
pshufd m3, m2, q3332
; Rows 1, 3, 5, 7 (dst already points at row 1)
mova [dstq+strideq *0], m1
mova [dstq+strideq *2], m0
mova [dstq+strideq *4], m2
mova [dstq+stride3q*2], m3
RET
; vp9_ipred_hu_16x16_16(dst, stride, l, a)
; Fills a 16x16 block of 16-bit samples at dst using only the 16 samples at lq;
; 'a' is named but never read. The loop runs 4 times and stores rows n, n+4,
; n+8 and n+12 per pass (32 bytes each), then advances the whole register
; chain by 4 bytes. The non-ssse3 build requests one extra xmm register,
; which the PALIGNR calls in the loop use as their temporary (m6).
; NOTE(review): SHIFT_RIGHTx2, LOWPASS, SBUTTERFLY and PALIGNR are defined
; outside this chunk.
cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
mova m0, [lq]
mova m3, [lq+mmsize]
movu m1, [lq+2]
movu m2, [lq+4]
; First 8 samples: filtered into m2, averaged into m1, then interleaved so
; that m1:m2 is one 16-sample sequence.
LOWPASS 2, 1, 0
pavgw m1, m0
SBUTTERFLY wd, 1, 2, 0
%if cpuflag(ssse3)
; Shuffle mask handed to SHIFT_RIGHTx2 as its fourth operand on ssse3+.
mova m5, [pb_2to15_14_15]
%endif
; Last 8 samples: there is nothing to load past the end of the column, so the
; shifted copies come from SHIFT_RIGHTx2 instead of unaligned loads.
SHIFT_RIGHTx2 m0, m4, m3, m5
LOWPASS 4, 0, 3
pavgw m3, m0
SBUTTERFLY wd, 3, 4, 5
; m0 becomes the fill vector at the end of the chain (one dword of m0
; replicated; assumes q3333 selects dword 3 for every lane).
pshufd m0, m0, q3333
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
mov cntd, 4
.loop:
; Rows n, n+4, n+8, n+12: each row starts one vector further along the chain
; m1, m2, m3, m4, m0.
mova [dstq+strideq *0+ 0], m1
mova [dstq+strideq *0+16], m2
mova [dstq+strideq *4+ 0], m2
mova [dstq+strideq *4+16], m3
mova [dstq+strideq *8+ 0], m3
mova [dstq+strideq *8+16], m4
mova [dstq+stride3q*4+ 0], m4
mova [dstq+stride3q*4+16], m0
add dstq, strideq
; Advance m1..m4 by 4 bytes, each pulling from its successor; m0 is constant.
%if cpuflag(avx)
vpalignr m1, m2, m1, 4
vpalignr m2, m3, m2, 4
vpalignr m3, m4, m3, 4
vpalignr m4, m0, m4, 4
%else
; Without 3-operand AVX forms the result goes through m5 (m6 = temporary).
PALIGNR m5, m2, m1, 4, m6
mova m1, m5
PALIGNR m5, m3, m2, 4, m6
mova m2, m5
PALIGNR m5, m4, m3, 4, m6
mova m3, m5
PALIGNR m5, m0, m4, 4, m6
mova m4, m5
%endif
dec cntd
jg .loop
RET
; vp9_ipred_hu_32x32_16(dst, stride, l, a)
; Fills a 32x32 block of 16-bit samples at dst using only the 32 samples at lq;
; 'a' is named but never read. The loop runs 4 times and stores eight rows per
; pass (n, n+4, ..., n+28; 64 bytes each), then advances the register chain by
; 4 bytes. The x86-32 stack size comes from the macro parameter: the sse2
; build touches slots 0-3, ssse3 slots 0-2 and avx slots 0-1.
; NOTE(review): SCRATCH/UNSCRATCH, SHIFT_RIGHTx2, LOWPASS, SBUTTERFLY and
; PALIGNR are defined outside this chunk. From their use here SCRATCH appears
; to park a register in a high xmm register on x86-64 or in the given stack
; slot on x86-32 - confirm.
cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
%1 * -mmsize * ARCH_X86_32, dst, stride, l, a
; Samples 0-7: averaged+filtered, interleaved into m1:m0. m1 (the very first
; vector) is parked in slot 0 / m8.
mova m2, [lq+mmsize*0+0]
movu m1, [lq+mmsize*0+2]
movu m0, [lq+mmsize*0+4]
LOWPASS 0, 1, 2
pavgw m1, m2
SBUTTERFLY wd, 1, 0, 2
SCRATCH 1, 8, rsp+0*mmsize
; Samples 8-15 -> m3:m2
mova m4, [lq+mmsize*1+0]
movu m3, [lq+mmsize*1+2]
movu m2, [lq+mmsize*1+4]
LOWPASS 2, 3, 4
pavgw m3, m4
SBUTTERFLY wd, 3, 2, 4
; Samples 16-23 -> m5:m4
mova m6, [lq+mmsize*2+0]
movu m5, [lq+mmsize*2+2]
movu m4, [lq+mmsize*2+4]
LOWPASS 4, 5, 6
pavgw m5, m6
SBUTTERFLY wd, 5, 4, 6
; Samples 24-31 -> m7:m6. m0 is parked temporarily because it is needed as
; the SHIFT_RIGHTx2 mask/temporary and as the SBUTTERFLY temporary.
mova m7, [lq+mmsize*3+0]
SCRATCH 0, 9, rsp+1*mmsize
%if cpuflag(ssse3)
mova m0, [pb_2to15_14_15]
%endif
SHIFT_RIGHTx2 m1, m6, m7, m0
LOWPASS 6, 1, 7
pavgw m7, m1
SBUTTERFLY wd, 7, 6, 0
; m1 becomes the fill vector at the end of the chain (assumes q3333 selects
; dword 3 for every lane).
pshufd m1, m1, q3333
UNSCRATCH 0, 9, rsp+1*mmsize
DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
; Row offsets: 3, 4, 20 (= 4 * 5) and 28 (= 4 * 8 - 4) times stride.
lea stride3q, [strideq*3]
lea stride4q, [strideq*4]
lea stride28q, [stride4q*8]
lea stride20q, [stride4q*5]
sub stride28q, stride4q
mov cntd, 4
.loop:
; m1 does double duty: exchange the fill vector for the first vector of the
; chain (m8 on x86-64, stack slot 0 on x86-32; slot 1 keeps the fill vector).
%if ARCH_X86_64
SWAP 1, 8
%else
mova [rsp+1*mmsize], m1
mova m1, [rsp+0*mmsize]
%endif
; Rows n, n+4, n+8: chain order is m1, m0, m3, m2, m5, m4, m7, m6, fill.
mova [dstq+strideq *0+ 0], m1
mova [dstq+strideq *0+16], m0
mova [dstq+strideq *0+32], m3
mova [dstq+strideq *0+48], m2
mova [dstq+stride4q*1+ 0], m0
mova [dstq+stride4q*1+16], m3
mova [dstq+stride4q*1+32], m2
mova [dstq+stride4q*1+48], m5
mova [dstq+stride4q*2+ 0], m3
mova [dstq+stride4q*2+16], m2
mova [dstq+stride4q*2+32], m5
mova [dstq+stride4q*2+48], m4
; Advance the first three chain vectors by 4 bytes. m2 is advanced in the
; second half, after its old value has been used there.
%if cpuflag(avx)
vpalignr m1, m0, m1, 4
vpalignr m0, m3, m0, 4
vpalignr m3, m2, m3, 4
%else
; Free m6 as the PALIGNR destination (and m7 as its temporary on sse2).
SCRATCH 6, 9, rsp+2*mmsize
%if notcpuflag(ssse3)
SCRATCH 7, 10, rsp+3*mmsize
%endif
PALIGNR m6, m0, m1, 4, m7
mova m1, m6
PALIGNR m6, m3, m0, 4, m7
mova m0, m6
PALIGNR m6, m2, m3, 4, m7
mova m3, m6
; Restore m6 (and m7), then park the updated m0 (and m3 on sse2) so they can
; serve as destination/temporary in the second half.
UNSCRATCH 6, 9, rsp+2*mmsize
SCRATCH 0, 9, rsp+2*mmsize
%if notcpuflag(ssse3)
UNSCRATCH 7, 10, rsp+3*mmsize
SCRATCH 3, 10, rsp+3*mmsize
%endif
%endif
; Put the updated first vector away and bring the fill vector back into m1.
%if ARCH_X86_64
SWAP 1, 8
%else
mova [rsp+0*mmsize], m1
mova m1, [rsp+1*mmsize]
%endif
; Rows n+12, n+16, n+20, n+24, n+28; the tail of the later rows is padded
; with the fill vector m1.
mova [dstq+stride3q*4+ 0], m2
mova [dstq+stride3q*4+16], m5
mova [dstq+stride3q*4+32], m4
mova [dstq+stride3q*4+48], m7
mova [dstq+stride4q*4+ 0], m5
mova [dstq+stride4q*4+16], m4
mova [dstq+stride4q*4+32], m7
mova [dstq+stride4q*4+48], m6
mova [dstq+stride20q + 0], m4
mova [dstq+stride20q +16], m7
mova [dstq+stride20q +32], m6
mova [dstq+stride20q +48], m1
mova [dstq+stride3q*8+ 0], m7
mova [dstq+stride3q*8+16], m6
mova [dstq+stride3q*8+32], m1
mova [dstq+stride3q*8+48], m1
mova [dstq+stride28q + 0], m6
mova [dstq+stride28q +16], m1
mova [dstq+stride28q +32], m1
mova [dstq+stride28q +48], m1
; Advance the remaining chain vectors m2, m5, m4, m7, m6 by 4 bytes.
%if cpuflag(avx)
vpalignr m2, m5, m2, 4
vpalignr m5, m4, m5, 4
vpalignr m4, m7, m4, 4
vpalignr m7, m6, m7, 4
vpalignr m6, m1, m6, 4
%else
; m0 is the destination and m3 the temporary here; both were parked above.
PALIGNR m0, m5, m2, 4, m3
mova m2, m0
PALIGNR m0, m4, m5, 4, m3
mova m5, m0
PALIGNR m0, m7, m4, 4, m3
mova m4, m0
PALIGNR m0, m6, m7, 4, m3
mova m7, m0
PALIGNR m0, m1, m6, 4, m3
mova m6, m0
UNSCRATCH 0, 9, rsp+2*mmsize
%if notcpuflag(ssse3)
UNSCRATCH 3, 10, rsp+3*mmsize
%endif
%endif
add dstq, strideq
dec cntd
jg .loop
RET
%endmacro
; Instantiate the HU_FUNCS macro once per instruction set. The argument is the
; number of mmsize-sized stack slots the 32x32 function reserves on x86-32:
; the sse2 path uses slots 0-3, ssse3 slots 0-2 and avx slots 0-1.
INIT_XMM sse2
HU_FUNCS 4
INIT_XMM ssse3
HU_FUNCS 3
INIT_XMM avx
HU_FUNCS 2
%macro HD_FUNCS 0
; vp9_ipred_hd_4x4_16(dst, stride, l, a)
; Fills a 4x4 block of 16-bit samples at dst from four samples at lq and four
; samples starting one sample before aq. Rows are stored bottom-up: rows 3 and
; 1 come from m1, rows 2 and 0 from m2.
; NOTE(review): LOWPASS and PALIGNR are defined outside this chunk.
cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
; m0 = four left samples in the low half, aq[-1..2] in the high half.
movh m0, [lq]
movhps m0, [aq-2]
psrldq m1, m0, 2
psrldq m2, m0, 4
; m2 = filtered samples, m1 = averaged samples.
LOWPASS 2, 1, 0
pavgw m1, m0
; Interleave the low four averaged and filtered samples word by word.
punpcklwd m1, m2
; l and a are no longer needed; the third register is reused for 3 * stride.
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
movh [dstq+stride3q ], m1
movhps [dstq+strideq*1], m1
; Copy the upper filtered samples into the low half of m2, then take the
; m2:m1 pair shifted down by 4 bytes for the remaining two rows.
movhlps m2, m2
PALIGNR m2, m1, 4, m0
movh [dstq+strideq*2], m2
movhps [dstq+strideq*0], m2
RET
; vp9_ipred_hd_8x8_16(dst, stride, l, a)
; Fills an 8x8 block of 16-bit samples at dst from eight samples at lq and the
; row starting one sample before aq. Rows are written bottom-up: the loop runs
; 4 times and stores rows 7-n and 3-n per pass, then advances the chain
; m2 <- m3 <- m1 by 4 bytes.
; NOTE(review): LOWPASS, SBUTTERFLY and PALIGNR are defined outside this chunk.
cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
mova m0, [lq]
movu m1, [aq-2]
; m2 / m3 = the left column shifted by one / two samples, continuing into the
; vector loaded from aq-2.
PALIGNR m2, m1, m0, 2, m3
PALIGNR m3, m1, m0, 4, m4
LOWPASS 3, 2, 0
pavgw m2, m0
; Interleave averaged (m2) and filtered (m3) samples into the sequence m2:m3.
SBUTTERFLY wd, 2, 3, 0
; m1 = filtered above-row samples, shifted into the chain after m3.
psrldq m0, m1, 2
psrldq m4, m1, 4
LOWPASS 1, 0, 4
DEFINE_ARGS dst8, mstride, cnt
; Point dst8 eight rows below dst and negate the stride so the loop walks up.
lea dst8q, [dst8q+mstrideq*8]
neg mstrideq
mov cntd, 4
.loop:
; Step up one row; mstrideq*4 addresses the row four above that.
add dst8q, mstrideq
mova [dst8q+mstrideq*0], m2
mova [dst8q+mstrideq*4], m3
%if cpuflag(avx)
vpalignr m2, m3, m2, 4
vpalignr m3, m1, m3, 4
%else
; Without 3-operand AVX forms the result goes through m0 (m4 = temporary).
PALIGNR m0, m3, m2, 4, m4
mova m2, m0
PALIGNR m0, m1, m3, 4, m4
mova m3, m0
%endif
psrldq m1, 4
dec cntd
jg .loop
RET
; vp9_ipred_hd_16x16_16(dst, stride, l, a)
; Fills a 16x16 block of 16-bit samples at dst from 16 samples at lq and the
; row starting one sample before aq. Rows are written bottom-up: the loop runs
; 4 times and stores rows 15-n, 11-n, 7-n and 3-n per pass (32 bytes each),
; then advances the chain m1 <- m0 <- m3 <- m2 <- m4 <- m5 by 4 bytes.
; NOTE(review): LOWPASS, SBUTTERFLY and PALIGNR are defined outside this chunk.
cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
; Left samples 0-7: filtered into m0, averaged into m1.
mova m2, [lq]
movu m1, [lq+2]
movu m0, [lq+4]
LOWPASS 0, 1, 2
pavgw m1, m2
; Left samples 8-15: the shifted copies continue into the vector at aq-2.
mova m4, [lq+mmsize]
movu m5, [aq-2]
PALIGNR m3, m5, m4, 2, m6
PALIGNR m2, m5, m4, 4, m6
LOWPASS 2, 3, 4
pavgw m3, m4
; Interleave averaged and filtered samples: sequences m1:m0 and m3:m2.
SBUTTERFLY wd, 1, 0, 4
SBUTTERFLY wd, 3, 2, 4
; Filtered above-row samples: first vector in m4, second in m5.
mova m6, [aq]
movu m4, [aq+2]
LOWPASS 4, 6, 5
movu m5, [aq+mmsize-2]
psrldq m6, m5, 2
psrldq m7, m5, 4
LOWPASS 5, 6, 7
DEFINE_ARGS dst, mstride, mstride3, cnt
; Point dst 16 rows below the block start (two steps of 8 rows), then negate
; the stride so the loop walks up.
lea dstq, [dstq+mstrideq*8]
lea dstq, [dstq+mstrideq*8]
neg mstrideq
lea mstride3q, [mstrideq*3]
mov cntd, 4
.loop:
add dstq, mstrideq
; Offsets 12, 8, 4 and 0 rows above the current row.
mova [dstq+mstride3q*4+ 0], m2
mova [dstq+mstride3q*4+16], m4
mova [dstq+mstrideq *8+ 0], m3
mova [dstq+mstrideq *8+16], m2
mova [dstq+mstrideq *4+ 0], m0
mova [dstq+mstrideq *4+16], m3
mova [dstq+mstrideq *0+ 0], m1
mova [dstq+mstrideq *0+16], m0
%if cpuflag(avx)
vpalignr m1, m0, m1, 4
vpalignr m0, m3, m0, 4
vpalignr m3, m2, m3, 4
vpalignr m2, m4, m2, 4
vpalignr m4, m5, m4, 4
%else
; Without 3-operand AVX forms the result goes through m6 (m7 = temporary).
PALIGNR m6, m0, m1, 4, m7
mova m1, m6
PALIGNR m6, m3, m0, 4, m7
mova m0, m6
PALIGNR m6, m2, m3, 4, m7
mova m3, m6
PALIGNR m6, m4, m2, 4, m7
mova m2, m6
PALIGNR m6, m5, m4, 4, m7
mova m4, m6
%endif
psrldq m5, 4
dec cntd
jg .loop
RET
; vp9_ipred_hd_32x32_16(dst, stride, l, a)
; Fills a 32x32 block of 16-bit samples at dst from 32 samples at lq and the
; row starting one sample before aq. Rows are 64 bytes and are written
; bottom-up in groups four rows apart.
; On x86-64 (7 GPRs) one 4-pass loop stores eight rows per pass. On x86-32
; (4 GPRs, 8 xmm registers) the first loop stores only the upper 16 rows and a
; second loop (.loop_2) stores the lower 16 from the vectors saved in stack
; slots 0-7; slots 8-9 are loop-local scratch (10 slots in total).
; NOTE(review): SCRATCH/UNSCRATCH, LOWPASS, SBUTTERFLY and PALIGNR are defined
; outside this chunk. From their use here SCRATCH appears to park a register
; in a high xmm register on x86-64 or in the given stack slot on x86-32, and
; its optional fourth argument defines reg_<name> for the parked value -
; confirm.
cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
10 * -mmsize * ARCH_X86_32, dst, stride, l, a
; Left samples 0-7 -> interleaved averaged/filtered sequence m1:m0.
mova m2, [lq+mmsize*0+0]
movu m1, [lq+mmsize*0+2]
movu m0, [lq+mmsize*0+4]
LOWPASS 0, 1, 2
pavgw m1, m2
SBUTTERFLY wd, 1, 0, 2
; Left samples 8-15 -> m3:m2.
mova m4, [lq+mmsize*1+0]
movu m3, [lq+mmsize*1+2]
movu m2, [lq+mmsize*1+4]
LOWPASS 2, 3, 4
pavgw m3, m4
SBUTTERFLY wd, 3, 2, 4
; Park the first four chain vectors (slots 0-3 / m8-m11).
SCRATCH 0, 8, rsp+0*mmsize
SCRATCH 1, 9, rsp+1*mmsize
SCRATCH 2, 10, rsp+2*mmsize
SCRATCH 3, 11, rsp+3*mmsize
; Left samples 16-23 -> m5:m4.
mova m6, [lq+mmsize*2+0]
movu m5, [lq+mmsize*2+2]
movu m4, [lq+mmsize*2+4]
LOWPASS 4, 5, 6
pavgw m5, m6
SBUTTERFLY wd, 5, 4, 6
; Left samples 24-31 -> m7:m6; the shifted copies continue into the vector
; loaded from aq-2.
mova m0, [lq+mmsize*3+0]
movu m1, [aq+mmsize*0-2]
PALIGNR m7, m1, m0, 2, m2
PALIGNR m6, m1, m0, 4, m2
LOWPASS 6, 7, 0
pavgw m7, m0
SBUTTERFLY wd, 7, 6, 0
; Filtered above row: four vectors, ending up in m0, m1, m2 and m3.
mova m2, [aq+mmsize*0+0]
movu m0, [aq+mmsize*0+2]
LOWPASS 0, 2, 1
movu m1, [aq+mmsize*1-2]
mova m2, [aq+mmsize*1+0]
movu m3, [aq+mmsize*1+2]
LOWPASS 1, 2, 3
; m6/m7 are needed as temporaries for the last two filters; on x86-32 this
; also leaves their values in slots 6 and 7 for the second loop.
SCRATCH 6, 12, rsp+6*mmsize
SCRATCH 7, 13, rsp+7*mmsize
movu m2, [aq+mmsize*2-2]
mova m3, [aq+mmsize*2+0]
movu m6, [aq+mmsize*2+2]
LOWPASS 2, 3, 6
movu m3, [aq+mmsize*3-2]
psrldq m6, m3, 2
psrldq m7, m3, 4
LOWPASS 3, 6, 7
UNSCRATCH 6, 12, rsp+6*mmsize
UNSCRATCH 7, 13, rsp+7*mmsize
%if ARCH_X86_32
mova [rsp+4*mmsize], m4
mova [rsp+5*mmsize], m5
; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
; to do it again here
%endif
DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
mov cntd, 4
lea stride3q, [strideq*3]
%if ARCH_X86_64
; Row offsets 4, 20 (= 4 * 5) and 28 (= 4 * 8 - 4) times stride; these
; registers only exist on x86-64.
lea stride4q, [strideq*4]
lea stride28q, [stride4q*8]
lea stride20q, [stride4q*5]
sub stride28q, stride4q
%endif
; Start at row 3; every pass steps one row up.
add dstq, stride3q
; x86-32 doesn't have enough registers, so on that platform, we split
; the loop in 2... Otherwise you spend most of the loop (un)scratching
.loop:
%if ARCH_X86_64
; Rows +28, +24, +20, +16 relative to the current row. Full chain order is
; m9, m8, m11, m10, m5, m4, m7, m6, m0, m1, m2, m3.
mova [dstq+stride28q + 0], m9
mova [dstq+stride28q +16], m8
mova [dstq+stride28q +32], m11
mova [dstq+stride28q +48], m10
mova [dstq+stride3q*8+ 0], m8
mova [dstq+stride3q*8+16], m11
mova [dstq+stride3q*8+32], m10
mova [dstq+stride3q*8+48], m5
mova [dstq+stride20q + 0], m11
mova [dstq+stride20q +16], m10
mova [dstq+stride20q +32], m5
mova [dstq+stride20q +48], m4
mova [dstq+stride4q*4+ 0], m10
mova [dstq+stride4q*4+16], m5
mova [dstq+stride4q*4+32], m4
mova [dstq+stride4q*4+48], m7
%endif
; Rows +12, +8, +4, +0 relative to the current row (both architectures).
mova [dstq+stride3q*4+ 0], m5
mova [dstq+stride3q*4+16], m4
mova [dstq+stride3q*4+32], m7
mova [dstq+stride3q*4+48], m6
mova [dstq+strideq* 8+ 0], m4
mova [dstq+strideq* 8+16], m7
mova [dstq+strideq* 8+32], m6
mova [dstq+strideq* 8+48], m0
mova [dstq+strideq* 4+ 0], m7
mova [dstq+strideq* 4+16], m6
mova [dstq+strideq* 4+32], m0
mova [dstq+strideq* 4+48], m1
mova [dstq+strideq* 0+ 0], m6
mova [dstq+strideq* 0+16], m0
mova [dstq+strideq* 0+32], m1
mova [dstq+strideq* 0+48], m2
sub dstq, strideq
; Advance every chain vector by 4 bytes, each pulling from its successor.
%if cpuflag(avx)
%if ARCH_X86_64
vpalignr m9, m8, m9, 4
vpalignr m8, m11, m8, 4
vpalignr m11, m10, m11, 4
vpalignr m10, m5, m10, 4
%endif
vpalignr m5, m4, m5, 4
vpalignr m4, m7, m4, 4
vpalignr m7, m6, m7, 4
vpalignr m6, m0, m6, 4
vpalignr m0, m1, m0, 4
vpalignr m1, m2, m1, 4
vpalignr m2, m3, m2, 4
%else
%if ARCH_X86_64
; m12 = destination, m13 = temporary for the high-register part.
PALIGNR m12, m8, m9, 4, m13
mova m9, m12
PALIGNR m12, m11, m8, 4, m13
mova m8, m12
PALIGNR m12, m10, m11, 4, m13
mova m11, m12
PALIGNR m12, m5, m10, 4, m13
mova m10, m12
%endif
; Park m3 (readable afterwards as reg_sh) so it can act as the PALIGNR
; destination; on sse2 also park m2 so it can act as the temporary.
SCRATCH 3, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
SCRATCH 2, 13, rsp+9*mmsize
%endif
PALIGNR m3, m4, m5, 4, m2
mova m5, m3
PALIGNR m3, m7, m4, 4, m2
mova m4, m3
PALIGNR m3, m6, m7, 4, m2
mova m7, m3
PALIGNR m3, m0, m6, 4, m2
mova m6, m3
PALIGNR m3, m1, m0, 4, m2
mova m0, m3
; m2 is needed as data for the last two steps: restore it and use m0 as the
; temporary instead (parked/restored around them on sse2).
%if notcpuflag(ssse3)
UNSCRATCH 2, 13, rsp+9*mmsize
SCRATCH 0, 13, rsp+9*mmsize
%endif
PALIGNR m3, m2, m1, 4, m0
mova m1, m3
PALIGNR m3, reg_sh, m2, 4, m0
mova m2, m3
%if notcpuflag(ssse3)
UNSCRATCH 0, 13, rsp+9*mmsize
%endif
UNSCRATCH 3, 12, rsp+8*mmsize, sh
%endif
psrldq m3, 4
dec cntd
jg .loop
%if ARCH_X86_32
; Second half (x86-32 only): reload the eight vectors saved in slots 0-7.
UNSCRATCH 0, 8, rsp+0*mmsize
UNSCRATCH 1, 9, rsp+1*mmsize
UNSCRATCH 2, 10, rsp+2*mmsize
UNSCRATCH 3, 11, rsp+3*mmsize
mova m4, [rsp+4*mmsize]
mova m5, [rsp+5*mmsize]
mova m6, [rsp+6*mmsize]
mova m7, [rsp+7*mmsize]
; The first loop left dst one row above the block; moving down 20 rows puts
; it on row 19, so the +12/+8/+4/+0 stores below cover rows 16-31.
; The counter register is borrowed for 5 * stride, then renamed back.
DEFINE_ARGS dst, stride, stride5, stride3
lea stride5q, [strideq*5]
lea dstq, [dstq+stride5q*4]
DEFINE_ARGS dst, stride, cnt, stride3
mov cntd, 4
.loop_2:
; Chain order here is m1, m0, m3, m2, m5, m4, m7, m6.
mova [dstq+stride3q*4+ 0], m1
mova [dstq+stride3q*4+16], m0
mova [dstq+stride3q*4+32], m3
mova [dstq+stride3q*4+48], m2
mova [dstq+strideq* 8+ 0], m0
mova [dstq+strideq* 8+16], m3
mova [dstq+strideq* 8+32], m2
mova [dstq+strideq* 8+48], m5
mova [dstq+strideq* 4+ 0], m3
mova [dstq+strideq* 4+16], m2
mova [dstq+strideq* 4+32], m5
mova [dstq+strideq* 4+48], m4
mova [dstq+strideq* 0+ 0], m2
mova [dstq+strideq* 0+16], m5
mova [dstq+strideq* 0+32], m4
mova [dstq+strideq* 0+48], m7
sub dstq, strideq
%if cpuflag(avx)
vpalignr m1, m0, m1, 4
vpalignr m0, m3, m0, 4
vpalignr m3, m2, m3, 4
vpalignr m2, m5, m2, 4
vpalignr m5, m4, m5, 4
vpalignr m4, m7, m4, 4
vpalignr m7, m6, m7, 4
%else
; Same juggling as in the first loop: m6 is parked as reg_sh and used as the
; destination; the temporary is m7 first, then m5 once m7 is needed as data.
SCRATCH 6, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
SCRATCH 7, 13, rsp+9*mmsize
%endif
PALIGNR m6, m0, m1, 4, m7
mova m1, m6
PALIGNR m6, m3, m0, 4, m7
mova m0, m6
PALIGNR m6, m2, m3, 4, m7
mova m3, m6
PALIGNR m6, m5, m2, 4, m7
mova m2, m6
PALIGNR m6, m4, m5, 4, m7
mova m5, m6
%if notcpuflag(ssse3)
UNSCRATCH 7, 13, rsp+9*mmsize
SCRATCH 5, 13, rsp+9*mmsize
%endif
PALIGNR m6, m7, m4, 4, m5
mova m4, m6
PALIGNR m6, reg_sh, m7, 4, m5
mova m7, m6
%if notcpuflag(ssse3)
UNSCRATCH 5, 13, rsp+9*mmsize
%endif
UNSCRATCH 6, 12, rsp+8*mmsize, sh
%endif
psrldq m6, 4
dec cntd
jg .loop_2
%endif
RET
%endmacro
; Instantiate the HD_FUNCS macro once per instruction set; each INIT_XMM
; selects the cpuflags the following expansion is assembled for.
INIT_XMM sse2
HD_FUNCS
INIT_XMM ssse3
HD_FUNCS
INIT_XMM avx
HD_FUNCS