vpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
Alex Converse 0c00af126d Add vpx_highbd_convolve_{copy,avg}_sse2
single-threaded:
swanky (silvermont): ~1% faster overall
peppy (celeron,haswell): ~1.5% faster overall

Change-Id: Ib74f014374c63c9eaf2d38191cbd8e2edcc52073
2015-10-09 11:50:25 -07:00

219 lines
6.2 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
%macro convolve_fn 1-2
INIT_XMM sse2
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h
%endif
mov r4d, dword wm
%ifidn %2, highbd
shl r4d, 1
shl srcq, 1
shl src_strideq, 1
shl dstq, 1
shl dst_strideq, 1
%else
cmp r4d, 4
je .w4
%endif
cmp r4d, 8
je .w8
cmp r4d, 16
je .w16
cmp r4d, 32
je .w32
%ifidn %2, highbd
cmp r4d, 64
je .w64
mov r4d, dword hm
.loop128:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+32]
movu m3, [srcq+48]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+16]
pavg m2, [dstq+32]
pavg m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
movu m0, [srcq+64]
movu m1, [srcq+80]
movu m2, [srcq+96]
movu m3, [srcq+112]
add srcq, src_strideq
%ifidn %1, avg
pavg m0, [dstq+64]
pavg m1, [dstq+80]
pavg m2, [dstq+96]
pavg m3, [dstq+112]
%endif
mova [dstq+64], m0
mova [dstq+80], m1
mova [dstq+96], m2
mova [dstq+112], m3
add dstq, dst_strideq
dec r4d
jnz .loop128
RET
%endif
.w64
mov r4d, dword hm
.loop64:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+32]
movu m3, [srcq+48]
add srcq, src_strideq
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+16]
pavg m2, [dstq+32]
pavg m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
add dstq, dst_strideq
dec r4d
jnz .loop64
RET
.w32:
mov r4d, dword hm
.loop32:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+src_strideq]
movu m3, [srcq+src_strideq+16]
lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq +16]
pavg m2, [dstq+dst_strideq]
pavg m3, [dstq+dst_strideq+16]
%endif
mova [dstq ], m0
mova [dstq +16], m1
mova [dstq+dst_strideq ], m2
mova [dstq+dst_strideq+16], m3
lea dstq, [dstq+dst_strideq*2]
sub r4d, 2
jnz .loop32
RET
.w16:
mov r4d, dword hm
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop16:
movu m0, [srcq]
movu m1, [srcq+src_strideq]
movu m2, [srcq+src_strideq*2]
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+dst_strideq]
pavg m2, [dstq+dst_strideq*2]
pavg m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
mova [dstq+dst_strideq*2], m2
mova [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop16
RET
INIT_MMX sse
.w8:
mov r4d, dword hm
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop8:
movu m0, [srcq]
movu m1, [srcq+src_strideq]
movu m2, [srcq+src_strideq*2]
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+dst_strideq]
pavg m2, [dstq+dst_strideq*2]
pavg m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
mova [dstq+dst_strideq*2], m2
mova [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop8
RET
%ifnidn %2, highbd
.w4:
mov r4d, dword hm
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop4:
movh m0, [srcq]
movh m1, [srcq+src_strideq]
movh m2, [srcq+src_strideq*2]
movh m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
movh m4, [dstq]
movh m5, [dstq+dst_strideq]
movh m6, [dstq+dst_strideq*2]
movh m7, [dstq+r6q]
pavg m0, m4
pavg m1, m5
pavg m2, m6
pavg m3, m7
%endif
movh [dstq ], m0
movh [dstq+dst_strideq ], m1
movh [dstq+dst_strideq*2], m2
movh [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop4
RET
%endif
%endmacro
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif