vpx/vp9/encoder/x86/vp9_subpel_variance.asm
Johann eb88b172fe Make vp9 subpixel match vp8
The only difference between the two was that the vp9 function allowed
for every step in the bilinear filter (16 steps) while vp8 only allowed
for half of those. Since all the call sites in vp9 (<< 1) the input, it
only ever used the same steps as vp8.

This will allow moving the subpel variance to vpx_dsp with the rest of
the variance functions.

Change-Id: I6fa2509350a2dc610c46b3e15bde98a15a084b75
2015-06-03 22:10:51 -07:00

1397 lines
41 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
; Rounding constant (8 in every word lane). The bilinear code paths add it
; right before the "psraw ..., 4" that normalizes taps summing to 16.
pw_8: times 8 dw 8
; Bilinear taps for the non-SSSE3 paths. Each sub-pixel step owns 32 bytes:
; the first tap replicated in 8 words, then the second tap replicated in
; 8 words (read at +16). Entries are addressed as offset << filter_idx_shift,
; which is 5 for this table. There are 8 entries (steps 0..7).
bilin_filter_m_sse2: times 8 dw 16
times 8 dw 0
times 8 dw 14
times 8 dw 2
times 8 dw 12
times 8 dw 4
times 8 dw 10
times 8 dw 6
; step 4 (half-pel): both taps are 8, so one "times 16" covers the pair
times 16 dw 8
times 8 dw 6
times 8 dw 10
times 8 dw 4
times 8 dw 12
times 8 dw 2
times 8 dw 14
; Same taps for the SSSE3 paths, stored as interleaved (first, second) byte
; pairs so that a single pmaddubsw applies both taps. Each step owns 16 bytes
; (filter_idx_shift is 4 for this table).
bilin_filter_m_ssse3: times 8 db 16, 0
times 8 db 14, 2
times 8 db 12, 4
times 8 db 10, 6
; step 4 (half-pel): 8, 8
times 16 db 8
times 8 db 6, 10
times 8 db 4, 12
times 8 db 2, 14
SECTION .text
SECTION .text
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int x_offset, int y_offset,
; const uint8_t *dst, ptrdiff_t dst_stride,
; int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) through the given pointer.
; SUM_SSE_PAIR a, b, sum, sse
; Helper for SUM_SSE: a -= b (word lanes), then sum += a (words) and
; sse += a*a (pmaddwd squares the words and adds adjacent pairs into dwords).
; Clobbers a.
%macro SUM_SSE_PAIR 4 ; a, b, sum, sse
psubw %1, %2
paddw %3, %1
pmaddwd %1, %1
paddd %4, %1
%endmacro
; SUM_SSE src1, dst1, src2, dst2, sum, sse
; Accumulates the word differences (src2 - dst2) and (src1 - dst1) into the
; running word sum, and their squares into the running dword sse. The second
; pair is folded in first, then the first pair. src1 and src2 are clobbered.
; All six operands are expected to be distinct registers.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
SUM_SSE_PAIR %3, %4, %5, %6
SUM_SSE_PAIR %1, %2, %5, %6
%endmacro
; STORE_AND_RET
; Common epilogue for every code path of SUBPEL_VARIANCE. On entry:
;   m6 = per-lane signed word sums of (src - dst)
;   m7 = per-lane dword sums of squared differences
;   m5 = all zeros (the dedicated zero register)
; It reduces m7 horizontally and stores the result through the sse pointer,
; reduces m6 horizontally (after widening words to dwords) and returns that
; sum in eax. Clobbers m3, m4, m5 and r1.
%macro STORE_AND_RET 0
%if mmsize == 16
; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
; We have to sign-extend it before adding the words within the register
; and outputting to a dword.
pcmpgtw m5, m6 ; mask for 0 > x
movhlps m3, m7
punpcklwd m4, m6, m5
punpckhwd m6, m5 ; sign-extend m6 word->dword
paddd m7, m3
paddd m6, m4
pshufd m3, m7, 0x1
movhlps m4, m6
paddd m7, m3
paddd m6, m4
mov r1, ssem ; r1 = unsigned int *sse
pshufd m4, m6, 0x1
movd [r1], m7 ; store sse
paddd m6, m4
movd raxd, m6 ; store sum as return value
%else ; mmsize == 8
; 64-bit registers: fold the upper two words/one dword onto the lower half
; first, then sign-extend the two remaining word sums and add them.
pshufw m4, m6, 0xe
pshufw m3, m7, 0xe
paddw m6, m4
paddd m7, m3
pcmpgtw m5, m6 ; mask for 0 > x
mov r1, ssem ; r1 = unsigned int *sse
punpcklwd m6, m5 ; sign-extend m6 word->dword
movd [r1], m7 ; store sse
pshufw m4, m6, 0xe
paddd m6, m4
movd raxd, m6 ; store sum as return value
%endif
RET
%endmacro
; INC_SRC_BY_SRC_STRIDE
; Advances srcq by one source row.
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86 != 1 || CONFIG_PIC != 1
; src_stride is still live in its register
add srcq, src_strideq
%else
; 32-bit PIC build: the bilinear x+y path borrows the src_stride register as
; a temporary, so the stride is taken from its memory operand instead.
add srcq, src_stridemp
%endif
%endmacro
; SUBPEL_VARIANCE W [, avg]
; Emits sub_pixel_variance<W>xh (avg == 0, the default) or
; sub_pixel_avg_variance<W>xh (avg == 1) for the current INIT_MMX/INIT_XMM
; instruction set. The avg variant additionally averages the filtered source
; with a second predictor (sec) before comparing against dst.
;
; This first part declares the function, zeroes the accumulators and starts
; the dispatch on (x_offset, y_offset); the labelled sections that follow each
; handle one combination of "zero / half / other" offsets.
%macro SUBPEL_VARIANCE 1-2 0 ; W
; Pick the tap table and the shift that converts an offset into a byte index
; into it (16-byte entries for ssse3, 32-byte entries otherwise).
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
%ifdef PIC ; 64bit PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse
%endif
%define h heightd
; The sse argument register doubles as the table base pointer; the epilogue
; reads the sse pointer through ssem instead.
%define bilin_filter sseq
%else
%if ARCH_X86=1 && CONFIG_PIC=1
; 32-bit PIC: the addresses of the tap table and of pw_8 are computed once
; here and parked in the g_bilin_filter / g_pw_8 argument slots.
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse, g_bilin_filter, g_pw_8
%define h dword heightm
%define sec_str sec_stridemp
;Store bilin_filter and pw_8 location in stack
GET_GOT eax
add esp, 4 ; restore esp
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse, \
g_bilin_filter, g_pw_8
%define h heightd
;Store bilin_filter and pw_8 location in stack
GET_GOT eax
add esp, 4 ; restore esp
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
LOAD_IF_USED 0, 1 ; load eax, ecx back
%endif
%else
; Non-PIC builds: the table is addressed directly by its symbol.
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
7 + 2 * ARCH_X86_64, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse
%if ARCH_X86_64
%define h heightd
%define sec_str sec_strideq
%else
%define h dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse
%define h heightd
%endif
%define bilin_filter bilin_filter_m
%endif
%endif
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
; could perhaps use it for something more productive then
pxor m5, m5 ; dedicated zero register
; For W < 16 every loop iteration below consumes two rows, so halve the row
; count and (avg only) double the sec stride.
%if %1 < 16
sar h, 1
%if %2 == 1 ; avg
shl sec_str, 1
%endif
%endif
; FIXME(rbultje) replace by jumptable?
test x_offsetd, x_offsetd
jnz .x_nonzero
; x_offset == 0
test y_offsetd, y_offsetd
jnz .x_zero_y_nonzero
; x_offset == 0 && y_offset == 0
; Full-pel case: no interpolation, the source rows are compared against dst
; directly (after averaging with sec in the avg variant).
.x_zero_y_zero_loop:
%if %1 == 16
; one 16-pixel row per iteration
movu m0, [srcq]
mova m1, [dstq]
%if %2 == 1 ; avg
pavgb m0, [secq]
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; two rows per iteration
movh m0, [srcq]
%if %2 == 1 ; avg
; pack both source rows into one register so a single pavgb against [secq]
; covers them
%if mmsize == 16
movhps m0, [srcq+src_strideq]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
%endif
%else ; !avg
movh m2, [srcq+src_strideq]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
pavgb m0, [secq]
punpcklbw m3, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_zero_loop
STORE_AND_RET
; x_offset == 0, y_offset != 0: pick between the half-pel fast path and the
; generic vertical bilinear filter.
; The offsets index the 8-entry tap tables (steps 0..7), so the half-pel
; position is step 4 (the entry whose taps are 8, 8). For that step the
; bilinear result (8*a + 8*b + 8) >> 4 equals the rounded average computed by
; pavgb, which is what this fast path uses.
.x_zero_y_nonzero:
cmp y_offsetd, 4
jne .x_zero_y_nonhalf
; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
; one row per iteration: average each row with the row below it
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; two output rows per iteration, built from three source rows
movh m0, [srcq]
movh m2, [srcq+src_strideq]
%if %2 == 1 ; avg
; m2 = row1 | row2 and m0 = row0 | row1, so one pavgb yields both output rows
; in a single register, ready to be averaged with [secq]
%if mmsize == 16
movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
movh m1, [srcq+src_strideq*2]
punpckldq m2, m1
%else
punpckldq m2, [srcq+src_strideq*2]
%endif
%endif
movh m1, [dstq]
%if mmsize == 16
movlhps m0, m2
%else ; mmsize == 8
punpckldq m0, m2
%endif
movh m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_half_loop
STORE_AND_RET
; Vertical-only bilinear filter: each output row is
; (tap_a * row[n] + tap_b * row[n+1] + 8) >> 4, with the taps selected by
; y_offset. The setup below resolves filter_y_a / filter_y_b / filter_rnd to
; registers (x86-64 with xmm) or to memory operands (otherwise).
.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
; turn the offset into a byte index into the tap table
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
%if cpuflag(ssse3)
; interleave the two rows bytewise so pmaddubsw applies both taps at once
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
; instructions is the same (5), but it is 1 mul instead of 2, so might be
; slightly faster because of pmullw latency. It would also cut our rodata
; tables in half for this function, and save 1-2 registers on x86-64.
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m4, filter_y_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
; back to bytes for the average with sec, then widen to words again
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; two output rows per iteration from three source rows (m0, m2, m4)
movh m0, [srcq]
movh m2, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movh m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m4, filter_y_b
paddw m0, m1
paddw m2, filter_rnd
movh m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
; x_offset != 0: pick between the horizontal half-pel paths and the generic
; horizontal bilinear paths.
; The offsets index the 8-entry tap tables (steps 0..7), so the half-pel
; position is step 4 (the entry whose taps are 8, 8). For that step the
; bilinear result (8*a + 8*b + 8) >> 4 equals the rounded average computed by
; pavgb, which is what the half-pel paths use.
.x_nonzero:
cmp x_offsetd, 4
jne .x_nonhalf
; x_offset == 0.5
test y_offsetd, y_offsetd
jnz .x_half_y_nonzero
; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
; one row per iteration: average each pixel with its right-hand neighbour
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; two rows per iteration
movh m0, [srcq]
movh m4, [srcq+1]
%if %2 == 1 ; avg
; pack both rows (and both shifted rows) into one register each so a single
; pavgb pair handles the horizontal average and the average with sec
%if mmsize == 16
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
punpckldq m4, [srcq+src_strideq+1]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m2, [srcq+src_strideq]
movh m1, [dstq]
pavgb m0, m4
movh m4, [srcq+src_strideq+1]
movh m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_zero_loop
STORE_AND_RET
; x_offset == 0.5, y_offset != 0: pick between the vertical half-pel fast path
; and the generic vertical bilinear filter.
; The offsets index the 8-entry tap tables (steps 0..7), so the half-pel
; position is step 4 (the entry whose taps are 8, 8). For that step the
; bilinear result (8*a + 8*b + 8) >> 4 equals the rounded average computed by
; pavgb, which is what this fast path uses.
.x_half_y_nonzero:
cmp y_offsetd, 4
jne .x_half_y_nonhalf
; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
; Prime m0 with the horizontally averaged first row; inside the loop m0 always
; holds the previous row's horizontal average.
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
pavgb m4, m3
punpckhbw m3, m1, m5
pavgb m0, m4
%if %2 == 1 ; avg
punpcklbw m1, m5
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
; carry the current row's horizontal average into the next iteration
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; Prime m0 with the horizontally averaged first row; each iteration then
; produces two output rows.
movh m0, [srcq]
movh m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
movh m2, [srcq]
movh m3, [srcq+1]
%if %2 == 1 ; avg
; m2/m3 hold two rows each so that one pavgb gives both horizontal averages
%if mmsize == 16
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
%if %1 == 4
movh m1, [srcq+src_strideq]
punpckldq m2, m1
movh m1, [srcq+src_strideq+1]
punpckldq m3, m1
%else
punpckldq m2, [srcq+src_strideq]
punpckldq m3, [srcq+src_strideq+1]
%endif
%endif
pavgb m2, m3
; m0 = previous row | first new row, m4 = second new row (kept for the next
; iteration)
%if mmsize == 16
movlhps m0, m2
movhlps m4, m2
%else ; mmsize == 8
punpckldq m0, m2
pshufw m4, m2, 0xe
%endif
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
pavgb m0, [secq]
punpcklbw m3, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m4, [srcq+src_strideq]
movh m1, [srcq+src_strideq+1]
pavgb m2, m3
pavgb m4, m1
pavgb m0, m2
pavgb m2, m4
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
; carry the last row's horizontal average into the next iteration
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_half_loop
STORE_AND_RET
; Horizontal half-pel (pavgb with the right-hand neighbour) followed by a
; vertical bilinear filter between consecutive horizontally averaged rows.
; m0 carries the previous row's horizontal average across iterations.
.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
; turn the offset into a byte index into the tap table
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ;x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
%if %1 == 16
; prime m0 with the first row's horizontal average (bytes)
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_other_loop:
movu m4, [srcq]
movu m2, [srcq+1]
mova m1, [dstq]
pavgb m4, m2
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
punpcklbw m0, m5
paddw m2, m3
punpcklbw m3, m4, m5
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
%endif
punpckhbw m3, m1, m5
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
; m4 still holds this row's horizontal average as bytes
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; prime m0 with the first row's horizontal average
movh m0, [srcq]
movh m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
; the non-ssse3 loop keeps the carried row as words, the ssse3 loop as bytes
%if notcpuflag(ssse3)
punpcklbw m0, m5
%endif
.x_half_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
movh m4, [srcq+src_strideq]
movh m3, [srcq+src_strideq+1]
pavgb m2, m1
pavgb m4, m3
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movh m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
paddw m0, m1
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m2, m1
movh m1, [dstq]
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
; carry the second new row's horizontal average into the next iteration
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
; x_offset is neither 0 nor the half-pel step. If y_offset is non-zero the
; combined paths further below take over; otherwise this section applies a
; horizontal-only bilinear filter:
; (tap_a * px[n] + tap_b * px[n+1] + 8) >> 4, with taps selected by x_offset.
.x_nonhalf:
test y_offsetd, y_offsetd
jnz .x_nonhalf_y_nonzero
; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
; turn the offset into a byte index into the tap table
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
;y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
.x_other_y_zero_loop:
%if %1 == 16
; one row per iteration
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
%if cpuflag(ssse3)
; interleave each pixel with its right-hand neighbour so pmaddubsw applies
; both taps at once
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
pmullw m0, filter_x_a
pmullw m4, filter_x_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; two rows per iteration: (m0, m1) are row 0 and its shifted copy,
; (m2, m4) are row 1 and its shifted copy
movh m0, [srcq]
movh m1, [srcq+1]
movh m2, [srcq+src_strideq]
movh m4, [srcq+src_strideq+1]
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
punpcklbw m0, m1
movh m1, [dstq]
punpcklbw m2, m4
pmaddubsw m0, filter_x_a
pmaddubsw m2, filter_x_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m4, filter_x_b
paddw m0, m1
paddw m2, filter_rnd
movh m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET
; x_offset needs the bilinear filter, y_offset != 0: pick between the vertical
; half-pel fast path and the fully bilinear path.
; The offsets index the 8-entry tap tables (steps 0..7), so the half-pel
; position is step 4 (the entry whose taps are 8, 8). For that step the
; bilinear result (8*a + 8*b + 8) >> 4 equals the rounded average computed by
; pavgb/pavgw, which is what this fast path uses for the vertical step.
.x_nonhalf_y_nonzero:
cmp y_offsetd, 4
jne .x_nonhalf_y_nonhalf
; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
; turn the offset into a byte index into the tap table
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
%if %1 == 16
; Prime m0 with the horizontally filtered first row, packed back to bytes.
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4
add srcq, src_strideq
packuswb m0, m2
.x_other_y_half_loop:
; filter the next row horizontally into m4 (bytes), then average it with the
; previous filtered row held in m0
movu m4, [srcq]
movu m3, [srcq+1]
%if cpuflag(ssse3)
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
pavgb m0, m4
punpckhbw m3, m1, m5
punpcklbw m1, m5
%else
punpckhbw m2, m4, m5
punpckhbw m1, m3, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
paddw m4, m3
paddw m2, m1
mova m1, [dstq]
psraw m4, 4
psraw m2, 4
punpckhbw m3, m1, m5
; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
; have a 1-register shortage to be able to store the backup of the bilin
; filtered second line as words as cache for the next line. Packing into
; a byte costs 1 pack and 2 unpacks, but saves a register.
packuswb m4, m2
punpcklbw m1, m5
pavgb m0, m4
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
pavgb m0, [secq]
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
; carry this row's filtered bytes into the next iteration
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
; Prime m0 with the horizontally filtered first row, kept as words.
movh m0, [srcq]
movh m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
add srcq, src_strideq
psraw m0, 4
.x_other_y_half_loop:
; two output rows per iteration: filter two new rows (m2, m4) horizontally,
; then average prev/m2 and m2/m4 as words
movh m2, [srcq]
movh m1, [srcq+1]
movh m4, [srcq+src_strideq]
movh m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
paddw m2, filter_rnd
paddw m4, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
movh m1, [dstq]
paddw m4, m3
movh m3, [dstq+dst_strideq]
%endif
psraw m2, 4
psraw m4, 4
pavgw m0, m2
pavgw m2, m4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline - also consider going to bytes here
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
; carry the second new row's filtered words into the next iteration
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET
; Fully bilinear case: every row is first filtered horizontally with the
; x_offset taps, then consecutive filtered rows are filtered vertically with
; the y_offset taps. m0 carries the previous horizontally filtered row across
; iterations. This section also closes the SUBPEL_VARIANCE macro.
.x_nonhalf_y_nonhalf:
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
; turn both offsets into byte indices into the tap table
shl x_offsetd, filter_idx_shift
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m11, [bilin_filter+y_offsetq+16]
%endif
mova m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
mov tempq, g_bilin_filterm
add x_offsetq, tempq
add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
; prime m0 with the horizontally filtered first row, packed back to bytes
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4
INC_SRC_BY_SRC_STRIDE
packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
; horizontal pass on the new row -> m4 (bytes)
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
punpckhbw m3, m1, m5
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
; vertical pass between the previous row (m0) and the new row (m4)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
punpcklbw m1, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
psraw m0, 4
%else
; horizontal pass on the new row: low half in m3, high half in m1
movu m3, [srcq]
movu m4, [srcq+1]
punpckhbw m1, m3, m5
punpckhbw m2, m4, m5
punpcklbw m3, m5
punpcklbw m4, m5
pmullw m3, filter_x_a
pmullw m4, filter_x_b
paddw m3, filter_rnd
pmullw m1, filter_x_a
pmullw m2, filter_x_b
paddw m1, filter_rnd
paddw m3, m4
paddw m1, m2
psraw m3, 4
psraw m1, 4
; keep a byte-packed copy of the new row in m4 for the next iteration
packuswb m4, m3, m1
; vertical pass: previous row (m0, unpacked) with tap a, new row (m3/m1
; words) with tap b
punpckhbw m2, m0, m5
punpcklbw m0, m5
pmullw m2, filter_y_a
pmullw m1, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m2, m1
mova m1, [dstq]
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
punpckhbw m3, m1, m5
psraw m0, 4
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
; prime m0 with the horizontally filtered first row
movh m0, [srcq]
movh m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
psraw m0, 4
; the ssse3 loop carries the previous row as bytes, the other loop as words
%if cpuflag(ssse3)
packuswb m0, m0
%endif
INC_SRC_BY_SRC_STRIDE
.x_other_y_other_loop:
; two output rows per iteration: load two new rows into (m2, m1) and (m4, m3)
movh m2, [srcq]
movh m1, [srcq+1]
INC_SRC_BY_SRC_STRIDE
movh m4, [srcq]
movh m3, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movh m3, [dstq+dst_strideq]
movh m1, [dstq]
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m2, m2
packuswb m4, m4
; vertical pass: prev/m2 -> first output row, m2/m4 -> second output row
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
psraw m0, 4
psraw m2, 4
punpcklbw m1, m5
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
paddw m4, m3
psraw m2, 4
psraw m4, 4
; vertical pass: prev/m2 -> first output row, m2/m4 -> second output row
pmullw m0, filter_y_a
pmullw m3, m2, filter_y_b
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m0, m3
movh m3, [dstq+dst_strideq]
paddw m2, m1
movh m1, [dstq]
psraw m0, 4
psraw m2, 4
punpcklbw m3, m5
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
; carry the second new row's horizontal result into the next iteration
mova m0, m4
INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
%endmacro
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
; Instantiate the plain variants: width 4 with MMX registers, widths 8 and 16
; with XMM registers, once for the sse/sse2 level and once for ssse3.
INIT_MMX sse
SUBPEL_VARIANCE 4
INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
INIT_MMX ssse3
SUBPEL_VARIANCE 4
INIT_XMM ssse3
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
; Same set again with the second macro argument set to 1, which selects the
; sub_pixel_avg_variance (second-predictor averaging) variants.
INIT_MMX sse
SUBPEL_VARIANCE 4, 1
INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
INIT_MMX ssse3
SUBPEL_VARIANCE 4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1