eb88b172fe
The only difference between the two was that the vp9 function allowed for every step in the bilinear filter (16 steps) while vp8 only allowed for half of those. Since all the call sites in vp9 (<< 1) the input, it only ever used the same steps as vp8. This will allow moving the subpel variance to vpx_dsp with the rest of the variance functions. Change-Id: I6fa2509350a2dc610c46b3e15bde98a15a084b75
1397 lines
41 KiB
NASM
1397 lines
41 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
%include "third_party/x86inc/x86inc.asm"
|
|
|
|
SECTION_RODATA

; Rounding term added to the filtered sum before the arithmetic >> 4.
pw_8:
  times 8 dw 8

; Bilinear taps for the pmullw (SSE2) path. One entry per sub-pel step,
; 8 steps in total. Each entry is 32 bytes: 8 words of the first tap
; (16, 14, ..., 2) followed by 8 words of the second tap (0, 2, ..., 14).
bilin_filter_m_sse2:
%assign bilin_tap 0
%rep 8
  times 8 dw (16 - bilin_tap)
  times 8 dw bilin_tap
%assign bilin_tap bilin_tap + 2
%endrep

; Bilinear taps for the pmaddubsw (SSSE3) path. Same 8 steps, but each entry
; is 16 bytes of interleaved (first tap, second tap) byte pairs.
bilin_filter_m_ssse3:
%assign bilin_tap 0
%rep 8
  times 8 db (16 - bilin_tap), bilin_tap
%assign bilin_tap bilin_tap + 2
%endrep
%undef bilin_tap
|
|
|
|
SECTION .text
|
|
|
|
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
|
|
; int x_offset, int y_offset,
|
|
; const uint8_t *dst, ptrdiff_t dst_stride,
|
|
; int height, unsigned int *sse);
|
|
;
|
|
; This function returns the SE (the signed sum of the src - dst differences) and stores the SSE (the sum of the squared differences) through the sse pointer.
|
|
|
|
; Folds two vectors of pixel differences into the running totals.
;   %1, %3: source pixels as words; overwritten (first with the difference,
;           then with the pairwise sums of squares)
;   %2, %4: reference (dst) pixels as words; only read
;   %5:     running sum of differences, accumulated per word lane
;   %6:     running sum of squared differences, accumulated per dword lane
; NOTE(review): the %1 and %3 chains look deliberately interleaved for
; scheduling; keep the instruction order when editing.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
|
|
psubw %3, %4 ; %3 = src2 - dst2
|
|
psubw %1, %2 ; %1 = src1 - dst1
|
|
paddw %5, %3 ; sum += (src2 - dst2)
|
|
pmaddwd %3, %3 ; square each word, add adjacent pairs into dwords
|
|
paddw %5, %1 ; sum += (src1 - dst1)
|
|
pmaddwd %1, %1 ; square each word, add adjacent pairs into dwords
|
|
paddd %6, %3 ; sse += squares of (src2 - dst2)
|
|
paddd %6, %1 ; sse += squares of (src1 - dst1)
|
|
%endmacro
|
|
|
|
; Common epilogue: collapses the per-lane accumulators to scalars, writes the
; SSE through the sse pointer argument, returns the sum in eax and executes RET.
; Expects on entry:
;   m5 = all zeros, m6 = per-word sums, m7 = per-dword sums of squares.
; Clobbers m3, m4, m5 and r1.
%macro STORE_AND_RET 0
%if mmsize == 16
  ; --- sum -----------------------------------------------------------------
  ; With H=64 and W=16 every word lane has absorbed 2 (1 bit) x 64 (6 bits)
  ; differences of 9 bits each, i.e. 16 bits: it fits a signed word exactly.
  ; The lanes therefore have to be sign-extended to dwords before they are
  ; added together and output as a dword.
  pcmpgtw              m5, m6           ; m5 = 0xffff in every lane where 0 > m6
  punpcklwd            m4, m6, m5       ; low four words  -> signed dwords
  punpckhwd            m6, m5           ; high four words -> signed dwords
  paddd                m6, m4
  movhlps              m4, m6
  paddd                m6, m4
  pshufd               m4, m6, 0x1
  paddd                m6, m4           ; dword 0 of m6 = total sum

  ; --- sse -----------------------------------------------------------------
  movhlps              m3, m7
  paddd                m7, m3
  pshufd               m3, m7, 0x1
  paddd                m7, m3           ; dword 0 of m7 = total sse
%else ; mmsize == 8
  ; --- sum: add the two word pairs first, then sign-extend and finish -------
  pshufw               m4, m6, 0xe
  paddw                m6, m4
  pcmpgtw              m5, m6           ; m5 = 0xffff in every lane where 0 > m6
  punpcklwd            m6, m5           ; low two words -> signed dwords
  pshufw               m4, m6, 0xe
  paddd                m6, m4           ; dword 0 of m6 = total sum

  ; --- sse -----------------------------------------------------------------
  pshufw               m3, m7, 0xe
  paddd                m7, m3           ; dword 0 of m7 = total sse
%endif
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; *sse = total sse
  movd               raxd, m6           ; return value = total sum
  RET
%endmacro
|
|
|
|
; Advances srcq by one source row.
; On 32-bit x86 PIC builds the stride is read from its stack slot, because the
; two-dimensional bilinear path reuses the src_stride register as a scratch
; pointer. Every other configuration adds the register copy.
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
%define %%row_step src_stridemp
%else
%define %%row_step src_strideq
%endif
  add                srcq, %%row_step
%endmacro
|
|
|
|
; Expands to one sub-pixel variance function for the current INIT_MMX/INIT_XMM
; instruction set.
;   %1: block width W in pixels (must be <= 16; 4, 8 and 16 are instantiated)
;   %2: 1 builds the "avg" variant, which also averages the filtered source
;       with a second predictor (sec) before the comparison; defaults to 0
;
; bilin_filter_m selects the tap table matching the instruction set and
; filter_idx_shift is log2 of that table's entry size in bytes
; (sse2: 32-byte entries, ssse3: 16-byte entries).
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if notcpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%else
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%endif
|
|
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
|
|
; 11, not 13, if the registers are ordered correctly. May make a minor speed
|
|
; difference on Win64
|
|
|
|
%ifdef PIC ; 64bit PIC
|
|
%if %2 == 1 ; avg
|
|
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
|
|
x_offset, y_offset, \
|
|
dst, dst_stride, \
|
|
sec, sec_stride, height, sse
|
|
%define sec_str sec_strideq
|
|
%else
|
|
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
|
|
y_offset, dst, dst_stride, height, sse
|
|
%endif
|
|
%define h heightd
|
|
%define bilin_filter sseq
|
|
%else
|
|
%if ARCH_X86=1 && CONFIG_PIC=1
|
|
%if %2 == 1 ; avg
|
|
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
|
|
x_offset, y_offset, \
|
|
dst, dst_stride, \
|
|
sec, sec_stride, \
|
|
height, sse, g_bilin_filter, g_pw_8
|
|
%define h dword heightm
|
|
%define sec_str sec_stridemp
|
|
|
|
;Store bilin_filter and pw_8 location in stack
|
|
GET_GOT eax
|
|
add esp, 4 ; restore esp
|
|
|
|
lea ecx, [GLOBAL(bilin_filter_m)]
|
|
mov g_bilin_filterm, ecx
|
|
|
|
lea ecx, [GLOBAL(pw_8)]
|
|
mov g_pw_8m, ecx
|
|
|
|
LOAD_IF_USED 0, 1 ; load eax, ecx back
|
|
%else
|
|
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
|
|
y_offset, dst, dst_stride, height, sse, \
|
|
g_bilin_filter, g_pw_8
|
|
%define h heightd
|
|
|
|
;Store bilin_filter and pw_8 location in stack
|
|
GET_GOT eax
|
|
add esp, 4 ; restore esp
|
|
|
|
lea ecx, [GLOBAL(bilin_filter_m)]
|
|
mov g_bilin_filterm, ecx
|
|
|
|
lea ecx, [GLOBAL(pw_8)]
|
|
mov g_pw_8m, ecx
|
|
|
|
LOAD_IF_USED 0, 1 ; load eax, ecx back
|
|
%endif
|
|
%else
|
|
%if %2 == 1 ; avg
|
|
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
|
|
7 + 2 * ARCH_X86_64, 13, src, src_stride, \
|
|
x_offset, y_offset, \
|
|
dst, dst_stride, \
|
|
sec, sec_stride, \
|
|
height, sse
|
|
%if ARCH_X86_64
|
|
%define h heightd
|
|
%define sec_str sec_strideq
|
|
%else
|
|
%define h dword heightm
|
|
%define sec_str sec_stridemp
|
|
%endif
|
|
%else
|
|
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
|
|
y_offset, dst, dst_stride, height, sse
|
|
%define h heightd
|
|
%endif
|
|
|
|
%define bilin_filter bilin_filter_m
|
|
%endif
|
|
%endif
|
|
|
|
; Setup shared by every sub-pel case, followed by the first level of the
; (x_offset, y_offset) dispatch. Register roles from here on:
;   m5 = all zeros, m6 = running sum of differences, m7 = running sse.
ASSERT %1 <= 16 ; m6 overflows if w > 16
|
|
pxor m6, m6 ; sum
|
|
pxor m7, m7 ; sse
|
|
; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
|
|
; could perhaps use it for something more productive then
|
|
pxor m5, m5 ; dedicated zero register
|
|
%if %1 < 16 ; the W < 16 loops consume two rows per iteration
|
|
sar h, 1 ; so count row pairs instead of rows
|
|
%if %2 == 1 ; avg
|
|
shl sec_str, 1 ; and step the second predictor by two rows at a time
|
|
%endif
|
|
%endif
|
|
|
|
; FIXME(rbultje) replace by jumptable?
|
|
test x_offsetd, x_offsetd ; split on x_offset first: zero vs. non-zero
|
|
jnz .x_nonzero
|
|
; x_offset == 0
|
|
test y_offsetd, y_offsetd ; then on y_offset; both zero falls through
|
|
jnz .x_zero_y_nonzero
|
|
|
|
; x_offset == 0 && y_offset == 0
|
|
.x_zero_y_zero_loop:
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
mova m1, [dstq]
|
|
%if %2 == 1 ; avg
|
|
pavgb m0, [secq]
|
|
punpckhbw m3, m1, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%if %2 == 0 ; !avg
|
|
punpckhbw m3, m1, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
%if %2 == 1 ; avg
|
|
%if mmsize == 16
|
|
movhps m0, [srcq+src_strideq]
|
|
%else ; mmsize == 8
|
|
punpckldq m0, [srcq+src_strideq]
|
|
%endif
|
|
%else ; !avg
|
|
movh m2, [srcq+src_strideq]
|
|
%endif
|
|
movh m1, [dstq]
|
|
movh m3, [dstq+dst_strideq]
|
|
%if %2 == 1 ; avg
|
|
pavgb m0, [secq]
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%else ; !avg
|
|
punpcklbw m0, m5
|
|
punpcklbw m2, m5
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_zero_y_zero_loop
|
|
STORE_AND_RET
|
|
|
|
.x_zero_y_nonzero:
  ; y_offset indexes the 8-entry bilinear tables (taps 16/0, 14/2, ..., 2/14),
  ; so valid indices are 0..7 and the half-pel position (taps 8/8) is index 4.
  ; Half-pel takes the pavgb loop that follows; every other non-zero offset
  ; goes to the generic bilinear loop.
  cmp           y_offsetd, 4
  jne .x_zero_y_nonhalf
|
|
|
|
; x_offset == 0 && y_offset == 0.5
|
|
.x_zero_y_half_loop:
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m4, [srcq+src_strideq]
|
|
mova m1, [dstq]
|
|
pavgb m0, m4
|
|
punpckhbw m3, m1, m5
|
|
%if %2 == 1 ; avg
|
|
pavgb m0, [secq]
|
|
%endif
|
|
punpcklbw m1, m5
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m2, [srcq+src_strideq]
|
|
%if %2 == 1 ; avg
|
|
%if mmsize == 16
|
|
movhps m2, [srcq+src_strideq*2]
|
|
%else ; mmsize == 8
|
|
%if %1 == 4
|
|
movh m1, [srcq+src_strideq*2]
|
|
punpckldq m2, m1
|
|
%else
|
|
punpckldq m2, [srcq+src_strideq*2]
|
|
%endif
|
|
%endif
|
|
movh m1, [dstq]
|
|
%if mmsize == 16
|
|
movlhps m0, m2
|
|
%else ; mmsize == 8
|
|
punpckldq m0, m2
|
|
%endif
|
|
movh m3, [dstq+dst_strideq]
|
|
pavgb m0, m2
|
|
punpcklbw m1, m5
|
|
pavgb m0, [secq]
|
|
punpcklbw m3, m5
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%else ; !avg
|
|
movh m4, [srcq+src_strideq*2]
|
|
movh m1, [dstq]
|
|
pavgb m0, m2
|
|
movh m3, [dstq+dst_strideq]
|
|
pavgb m2, m4
|
|
punpcklbw m0, m5
|
|
punpcklbw m2, m5
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_zero_y_half_loop
|
|
STORE_AND_RET
|
|
|
|
.x_zero_y_nonhalf:
|
|
; x_offset == 0 && y_offset == bilin interpolation
|
|
%ifdef PIC
|
|
lea bilin_filter, [bilin_filter_m]
|
|
%endif
|
|
shl y_offsetd, filter_idx_shift
|
|
%if ARCH_X86_64 && mmsize == 16
|
|
mova m8, [bilin_filter+y_offsetq]
|
|
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
|
|
mova m9, [bilin_filter+y_offsetq+16]
|
|
%endif
|
|
mova m10, [pw_8]
|
|
%define filter_y_a m8
|
|
%define filter_y_b m9
|
|
%define filter_rnd m10
|
|
%else ; x86-32 or mmx
|
|
%if ARCH_X86=1 && CONFIG_PIC=1
|
|
; x_offset == 0, reuse x_offset reg
|
|
%define tempq x_offsetq
|
|
add y_offsetq, g_bilin_filterm
|
|
%define filter_y_a [y_offsetq]
|
|
%define filter_y_b [y_offsetq+16]
|
|
mov tempq, g_pw_8m
|
|
%define filter_rnd [tempq]
|
|
%else
|
|
add y_offsetq, bilin_filter
|
|
%define filter_y_a [y_offsetq]
|
|
%define filter_y_b [y_offsetq+16]
|
|
%define filter_rnd [pw_8]
|
|
%endif
|
|
%endif
|
|
|
|
.x_zero_y_other_loop:
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m4, [srcq+src_strideq]
|
|
mova m1, [dstq]
|
|
%if cpuflag(ssse3)
|
|
punpckhbw m2, m0, m4
|
|
punpcklbw m0, m4
|
|
pmaddubsw m2, filter_y_a
|
|
pmaddubsw m0, filter_y_a
|
|
paddw m2, filter_rnd
|
|
paddw m0, filter_rnd
|
|
%else
|
|
punpckhbw m2, m0, m5
|
|
punpckhbw m3, m4, m5
|
|
punpcklbw m0, m5
|
|
punpcklbw m4, m5
|
|
; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
|
|
; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
|
|
; instructions is the same (5), but it is 1 mul instead of 2, so might be
|
|
; slightly faster because of pmullw latency. It would also cut our rodata
|
|
; tables in half for this function, and save 1-2 registers on x86-64.
|
|
pmullw m2, filter_y_a
|
|
pmullw m3, filter_y_b
|
|
paddw m2, filter_rnd
|
|
pmullw m0, filter_y_a
|
|
pmullw m4, filter_y_b
|
|
paddw m0, filter_rnd
|
|
paddw m2, m3
|
|
paddw m0, m4
|
|
%endif
|
|
psraw m2, 4
|
|
psraw m0, 4
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
punpckhbw m3, m1, m5
|
|
punpcklbw m1, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m2, [srcq+src_strideq]
|
|
movh m4, [srcq+src_strideq*2]
|
|
movh m3, [dstq+dst_strideq]
|
|
%if cpuflag(ssse3)
|
|
movh m1, [dstq]
|
|
punpcklbw m0, m2
|
|
punpcklbw m2, m4
|
|
pmaddubsw m0, filter_y_a
|
|
pmaddubsw m2, filter_y_a
|
|
punpcklbw m3, m5
|
|
paddw m2, filter_rnd
|
|
paddw m0, filter_rnd
|
|
%else
|
|
punpcklbw m0, m5
|
|
punpcklbw m2, m5
|
|
punpcklbw m4, m5
|
|
pmullw m0, filter_y_a
|
|
pmullw m1, m2, filter_y_b
|
|
punpcklbw m3, m5
|
|
paddw m0, filter_rnd
|
|
pmullw m2, filter_y_a
|
|
pmullw m4, filter_y_b
|
|
paddw m0, m1
|
|
paddw m2, filter_rnd
|
|
movh m1, [dstq]
|
|
paddw m2, m4
|
|
%endif
|
|
psraw m0, 4
|
|
psraw m2, 4
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
punpcklbw m1, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_zero_y_other_loop
|
|
%undef filter_y_a
|
|
%undef filter_y_b
|
|
%undef filter_rnd
|
|
STORE_AND_RET
|
|
|
|
.x_nonzero:
  ; x_offset indexes the 8-entry bilinear tables (taps 16/0, 14/2, ..., 2/14),
  ; so valid indices are 0..7 and the half-pel position (taps 8/8) is index 4.
  ; Half-pel continues into the pavgb-based cases; every other non-zero offset
  ; needs the horizontal bilinear filter.
  cmp           x_offsetd, 4
  jne .x_nonhalf
|
|
; x_offset == 0.5
|
|
test y_offsetd, y_offsetd
|
|
jnz .x_half_y_nonzero
|
|
|
|
; x_offset == 0.5 && y_offset == 0
|
|
.x_half_y_zero_loop:
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m4, [srcq+1]
|
|
mova m1, [dstq]
|
|
pavgb m0, m4
|
|
punpckhbw m3, m1, m5
|
|
%if %2 == 1 ; avg
|
|
pavgb m0, [secq]
|
|
%endif
|
|
punpcklbw m1, m5
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m4, [srcq+1]
|
|
%if %2 == 1 ; avg
|
|
%if mmsize == 16
|
|
movhps m0, [srcq+src_strideq]
|
|
movhps m4, [srcq+src_strideq+1]
|
|
%else ; mmsize == 8
|
|
punpckldq m0, [srcq+src_strideq]
|
|
punpckldq m4, [srcq+src_strideq+1]
|
|
%endif
|
|
movh m1, [dstq]
|
|
movh m3, [dstq+dst_strideq]
|
|
pavgb m0, m4
|
|
punpcklbw m3, m5
|
|
pavgb m0, [secq]
|
|
punpcklbw m1, m5
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%else ; !avg
|
|
movh m2, [srcq+src_strideq]
|
|
movh m1, [dstq]
|
|
pavgb m0, m4
|
|
movh m4, [srcq+src_strideq+1]
|
|
movh m3, [dstq+dst_strideq]
|
|
pavgb m2, m4
|
|
punpcklbw m0, m5
|
|
punpcklbw m2, m5
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_half_y_zero_loop
|
|
STORE_AND_RET
|
|
|
|
.x_half_y_nonzero:
  ; y_offset indexes the 8-entry bilinear tables (taps 16/0, 14/2, ..., 2/14),
  ; so valid indices are 0..7 and the half-pel position (taps 8/8) is index 4.
  ; Half-pel in both directions is handled with pavgb only; any other y_offset
  ; needs the vertical bilinear filter.
  cmp           y_offsetd, 4
  jne .x_half_y_nonhalf
|
|
|
|
; x_offset == 0.5 && y_offset == 0.5
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m3, [srcq+1]
|
|
add srcq, src_strideq
|
|
pavgb m0, m3
|
|
.x_half_y_half_loop:
|
|
movu m4, [srcq]
|
|
movu m3, [srcq+1]
|
|
mova m1, [dstq]
|
|
pavgb m4, m3
|
|
punpckhbw m3, m1, m5
|
|
pavgb m0, m4
|
|
%if %2 == 1 ; avg
|
|
punpcklbw m1, m5
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%else
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m3, [srcq+1]
|
|
add srcq, src_strideq
|
|
pavgb m0, m3
|
|
.x_half_y_half_loop:
|
|
movh m2, [srcq]
|
|
movh m3, [srcq+1]
|
|
%if %2 == 1 ; avg
|
|
%if mmsize == 16
|
|
movhps m2, [srcq+src_strideq]
|
|
movhps m3, [srcq+src_strideq+1]
|
|
%else
|
|
%if %1 == 4
|
|
movh m1, [srcq+src_strideq]
|
|
punpckldq m2, m1
|
|
movh m1, [srcq+src_strideq+1]
|
|
punpckldq m3, m1
|
|
%else
|
|
punpckldq m2, [srcq+src_strideq]
|
|
punpckldq m3, [srcq+src_strideq+1]
|
|
%endif
|
|
%endif
|
|
pavgb m2, m3
|
|
%if mmsize == 16
|
|
movlhps m0, m2
|
|
movhlps m4, m2
|
|
%else ; mmsize == 8
|
|
punpckldq m0, m2
|
|
pshufw m4, m2, 0xe
|
|
%endif
|
|
movh m1, [dstq]
|
|
pavgb m0, m2
|
|
movh m3, [dstq+dst_strideq]
|
|
pavgb m0, [secq]
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%else ; !avg
|
|
movh m4, [srcq+src_strideq]
|
|
movh m1, [srcq+src_strideq+1]
|
|
pavgb m2, m3
|
|
pavgb m4, m1
|
|
pavgb m0, m2
|
|
pavgb m2, m4
|
|
movh m1, [dstq]
|
|
movh m3, [dstq+dst_strideq]
|
|
punpcklbw m0, m5
|
|
punpcklbw m2, m5
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_half_y_half_loop
|
|
STORE_AND_RET
|
|
|
|
.x_half_y_nonhalf:
|
|
; x_offset == 0.5 && y_offset == bilin interpolation
|
|
%ifdef PIC
|
|
lea bilin_filter, [bilin_filter_m]
|
|
%endif
|
|
shl y_offsetd, filter_idx_shift
|
|
%if ARCH_X86_64 && mmsize == 16
|
|
mova m8, [bilin_filter+y_offsetq]
|
|
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
|
|
mova m9, [bilin_filter+y_offsetq+16]
|
|
%endif
|
|
mova m10, [pw_8]
|
|
%define filter_y_a m8
|
|
%define filter_y_b m9
|
|
%define filter_rnd m10
|
|
%else ;x86_32
|
|
%if ARCH_X86=1 && CONFIG_PIC=1
|
|
; x_offset == 0.5. We can reuse x_offset reg
|
|
%define tempq x_offsetq
|
|
add y_offsetq, g_bilin_filterm
|
|
%define filter_y_a [y_offsetq]
|
|
%define filter_y_b [y_offsetq+16]
|
|
mov tempq, g_pw_8m
|
|
%define filter_rnd [tempq]
|
|
%else
|
|
add y_offsetq, bilin_filter
|
|
%define filter_y_a [y_offsetq]
|
|
%define filter_y_b [y_offsetq+16]
|
|
%define filter_rnd [pw_8]
|
|
%endif
|
|
%endif
|
|
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m3, [srcq+1]
|
|
add srcq, src_strideq
|
|
pavgb m0, m3
|
|
.x_half_y_other_loop:
|
|
movu m4, [srcq]
|
|
movu m2, [srcq+1]
|
|
mova m1, [dstq]
|
|
pavgb m4, m2
|
|
%if cpuflag(ssse3)
|
|
punpckhbw m2, m0, m4
|
|
punpcklbw m0, m4
|
|
pmaddubsw m2, filter_y_a
|
|
pmaddubsw m0, filter_y_a
|
|
paddw m2, filter_rnd
|
|
paddw m0, filter_rnd
|
|
psraw m2, 4
|
|
%else
|
|
punpckhbw m2, m0, m5
|
|
punpckhbw m3, m4, m5
|
|
pmullw m2, filter_y_a
|
|
pmullw m3, filter_y_b
|
|
paddw m2, filter_rnd
|
|
punpcklbw m0, m5
|
|
paddw m2, m3
|
|
punpcklbw m3, m4, m5
|
|
pmullw m0, filter_y_a
|
|
pmullw m3, filter_y_b
|
|
paddw m0, filter_rnd
|
|
psraw m2, 4
|
|
paddw m0, m3
|
|
%endif
|
|
punpckhbw m3, m1, m5
|
|
psraw m0, 4
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
punpcklbw m1, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m3, [srcq+1]
|
|
add srcq, src_strideq
|
|
pavgb m0, m3
|
|
%if notcpuflag(ssse3)
|
|
punpcklbw m0, m5
|
|
%endif
|
|
.x_half_y_other_loop:
|
|
movh m2, [srcq]
|
|
movh m1, [srcq+1]
|
|
movh m4, [srcq+src_strideq]
|
|
movh m3, [srcq+src_strideq+1]
|
|
pavgb m2, m1
|
|
pavgb m4, m3
|
|
movh m3, [dstq+dst_strideq]
|
|
%if cpuflag(ssse3)
|
|
movh m1, [dstq]
|
|
punpcklbw m0, m2
|
|
punpcklbw m2, m4
|
|
pmaddubsw m0, filter_y_a
|
|
pmaddubsw m2, filter_y_a
|
|
punpcklbw m3, m5
|
|
paddw m0, filter_rnd
|
|
paddw m2, filter_rnd
|
|
%else
|
|
punpcklbw m2, m5
|
|
punpcklbw m4, m5
|
|
pmullw m0, filter_y_a
|
|
pmullw m1, m2, filter_y_b
|
|
punpcklbw m3, m5
|
|
paddw m0, filter_rnd
|
|
pmullw m2, filter_y_a
|
|
paddw m0, m1
|
|
pmullw m1, m4, filter_y_b
|
|
paddw m2, filter_rnd
|
|
paddw m2, m1
|
|
movh m1, [dstq]
|
|
%endif
|
|
psraw m0, 4
|
|
psraw m2, 4
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
punpcklbw m1, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_half_y_other_loop
|
|
%undef filter_y_a
|
|
%undef filter_y_b
|
|
%undef filter_rnd
|
|
STORE_AND_RET
|
|
|
|
.x_nonhalf:
|
|
test y_offsetd, y_offsetd
|
|
jnz .x_nonhalf_y_nonzero
|
|
|
|
; x_offset == bilin interpolation && y_offset == 0
|
|
%ifdef PIC
|
|
lea bilin_filter, [bilin_filter_m]
|
|
%endif
|
|
shl x_offsetd, filter_idx_shift
|
|
%if ARCH_X86_64 && mmsize == 16
|
|
mova m8, [bilin_filter+x_offsetq]
|
|
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
|
|
mova m9, [bilin_filter+x_offsetq+16]
|
|
%endif
|
|
mova m10, [pw_8]
|
|
%define filter_x_a m8
|
|
%define filter_x_b m9
|
|
%define filter_rnd m10
|
|
%else ; x86-32
|
|
%if ARCH_X86=1 && CONFIG_PIC=1
|
|
;y_offset == 0. We can reuse y_offset reg.
|
|
%define tempq y_offsetq
|
|
add x_offsetq, g_bilin_filterm
|
|
%define filter_x_a [x_offsetq]
|
|
%define filter_x_b [x_offsetq+16]
|
|
mov tempq, g_pw_8m
|
|
%define filter_rnd [tempq]
|
|
%else
|
|
add x_offsetq, bilin_filter
|
|
%define filter_x_a [x_offsetq]
|
|
%define filter_x_b [x_offsetq+16]
|
|
%define filter_rnd [pw_8]
|
|
%endif
|
|
%endif
|
|
|
|
.x_other_y_zero_loop:
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m4, [srcq+1]
|
|
mova m1, [dstq]
|
|
%if cpuflag(ssse3)
|
|
punpckhbw m2, m0, m4
|
|
punpcklbw m0, m4
|
|
pmaddubsw m2, filter_x_a
|
|
pmaddubsw m0, filter_x_a
|
|
paddw m2, filter_rnd
|
|
paddw m0, filter_rnd
|
|
%else
|
|
punpckhbw m2, m0, m5
|
|
punpckhbw m3, m4, m5
|
|
punpcklbw m0, m5
|
|
punpcklbw m4, m5
|
|
pmullw m2, filter_x_a
|
|
pmullw m3, filter_x_b
|
|
paddw m2, filter_rnd
|
|
pmullw m0, filter_x_a
|
|
pmullw m4, filter_x_b
|
|
paddw m0, filter_rnd
|
|
paddw m2, m3
|
|
paddw m0, m4
|
|
%endif
|
|
psraw m2, 4
|
|
psraw m0, 4
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
punpckhbw m3, m1, m5
|
|
punpcklbw m1, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m1, [srcq+1]
|
|
movh m2, [srcq+src_strideq]
|
|
movh m4, [srcq+src_strideq+1]
|
|
movh m3, [dstq+dst_strideq]
|
|
%if cpuflag(ssse3)
|
|
punpcklbw m0, m1
|
|
movh m1, [dstq]
|
|
punpcklbw m2, m4
|
|
pmaddubsw m0, filter_x_a
|
|
pmaddubsw m2, filter_x_a
|
|
punpcklbw m3, m5
|
|
paddw m0, filter_rnd
|
|
paddw m2, filter_rnd
|
|
%else
|
|
punpcklbw m0, m5
|
|
punpcklbw m1, m5
|
|
punpcklbw m2, m5
|
|
punpcklbw m4, m5
|
|
pmullw m0, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
punpcklbw m3, m5
|
|
paddw m0, filter_rnd
|
|
pmullw m2, filter_x_a
|
|
pmullw m4, filter_x_b
|
|
paddw m0, m1
|
|
paddw m2, filter_rnd
|
|
movh m1, [dstq]
|
|
paddw m2, m4
|
|
%endif
|
|
psraw m0, 4
|
|
psraw m2, 4
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
punpcklbw m1, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_other_y_zero_loop
|
|
%undef filter_x_a
|
|
%undef filter_x_b
|
|
%undef filter_rnd
|
|
STORE_AND_RET
|
|
|
|
.x_nonhalf_y_nonzero:
  ; y_offset indexes the 8-entry bilinear tables (taps 16/0, 14/2, ..., 2/14),
  ; so valid indices are 0..7 and the half-pel position (taps 8/8) is index 4.
  ; A half-pel y_offset lets the vertical step be a plain average; any other
  ; value takes the full two-dimensional bilinear path.
  cmp           y_offsetd, 4
  jne .x_nonhalf_y_nonhalf
|
|
|
|
; x_offset == bilin interpolation && y_offset == 0.5
|
|
%ifdef PIC
|
|
lea bilin_filter, [bilin_filter_m]
|
|
%endif
|
|
shl x_offsetd, filter_idx_shift
|
|
%if ARCH_X86_64 && mmsize == 16
|
|
mova m8, [bilin_filter+x_offsetq]
|
|
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
|
|
mova m9, [bilin_filter+x_offsetq+16]
|
|
%endif
|
|
mova m10, [pw_8]
|
|
%define filter_x_a m8
|
|
%define filter_x_b m9
|
|
%define filter_rnd m10
|
|
%else ; x86-32
|
|
%if ARCH_X86=1 && CONFIG_PIC=1
|
|
; y_offset == 0.5. We can reuse y_offset reg.
|
|
%define tempq y_offsetq
|
|
add x_offsetq, g_bilin_filterm
|
|
%define filter_x_a [x_offsetq]
|
|
%define filter_x_b [x_offsetq+16]
|
|
mov tempq, g_pw_8m
|
|
%define filter_rnd [tempq]
|
|
%else
|
|
add x_offsetq, bilin_filter
|
|
%define filter_x_a [x_offsetq]
|
|
%define filter_x_b [x_offsetq+16]
|
|
%define filter_rnd [pw_8]
|
|
%endif
|
|
%endif
|
|
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m1, [srcq+1]
|
|
%if cpuflag(ssse3)
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
pmaddubsw m2, filter_x_a
|
|
pmaddubsw m0, filter_x_a
|
|
paddw m2, filter_rnd
|
|
paddw m0, filter_rnd
|
|
%else
|
|
punpckhbw m2, m0, m5
|
|
punpckhbw m3, m1, m5
|
|
punpcklbw m0, m5
|
|
punpcklbw m1, m5
|
|
pmullw m0, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
paddw m0, filter_rnd
|
|
pmullw m2, filter_x_a
|
|
pmullw m3, filter_x_b
|
|
paddw m2, filter_rnd
|
|
paddw m0, m1
|
|
paddw m2, m3
|
|
%endif
|
|
psraw m0, 4
|
|
psraw m2, 4
|
|
add srcq, src_strideq
|
|
packuswb m0, m2
|
|
.x_other_y_half_loop:
|
|
movu m4, [srcq]
|
|
movu m3, [srcq+1]
|
|
%if cpuflag(ssse3)
|
|
mova m1, [dstq]
|
|
punpckhbw m2, m4, m3
|
|
punpcklbw m4, m3
|
|
pmaddubsw m2, filter_x_a
|
|
pmaddubsw m4, filter_x_a
|
|
paddw m2, filter_rnd
|
|
paddw m4, filter_rnd
|
|
psraw m2, 4
|
|
psraw m4, 4
|
|
packuswb m4, m2
|
|
pavgb m0, m4
|
|
punpckhbw m3, m1, m5
|
|
punpcklbw m1, m5
|
|
%else
|
|
punpckhbw m2, m4, m5
|
|
punpckhbw m1, m3, m5
|
|
punpcklbw m4, m5
|
|
punpcklbw m3, m5
|
|
pmullw m4, filter_x_a
|
|
pmullw m3, filter_x_b
|
|
paddw m4, filter_rnd
|
|
pmullw m2, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
paddw m2, filter_rnd
|
|
paddw m4, m3
|
|
paddw m2, m1
|
|
mova m1, [dstq]
|
|
psraw m4, 4
|
|
psraw m2, 4
|
|
punpckhbw m3, m1, m5
|
|
; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
|
|
; have a 1-register shortage to be able to store the backup of the bilin
|
|
; filtered second line as words as cache for the next line. Packing into
|
|
; a byte costs 1 pack and 2 unpacks, but saves a register.
|
|
packuswb m4, m2
|
|
punpcklbw m1, m5
|
|
pavgb m0, m4
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
pavgb m0, [secq]
|
|
%endif
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
add srcq, src_strideq
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m1, [srcq+1]
|
|
%if cpuflag(ssse3)
|
|
punpcklbw m0, m1
|
|
pmaddubsw m0, filter_x_a
|
|
paddw m0, filter_rnd
|
|
%else
|
|
punpcklbw m0, m5
|
|
punpcklbw m1, m5
|
|
pmullw m0, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
paddw m0, filter_rnd
|
|
paddw m0, m1
|
|
%endif
|
|
add srcq, src_strideq
|
|
psraw m0, 4
|
|
.x_other_y_half_loop:
|
|
movh m2, [srcq]
|
|
movh m1, [srcq+1]
|
|
movh m4, [srcq+src_strideq]
|
|
movh m3, [srcq+src_strideq+1]
|
|
%if cpuflag(ssse3)
|
|
punpcklbw m2, m1
|
|
punpcklbw m4, m3
|
|
pmaddubsw m2, filter_x_a
|
|
pmaddubsw m4, filter_x_a
|
|
movh m1, [dstq]
|
|
movh m3, [dstq+dst_strideq]
|
|
paddw m2, filter_rnd
|
|
paddw m4, filter_rnd
|
|
%else
|
|
punpcklbw m2, m5
|
|
punpcklbw m1, m5
|
|
punpcklbw m4, m5
|
|
punpcklbw m3, m5
|
|
pmullw m2, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
paddw m2, filter_rnd
|
|
pmullw m4, filter_x_a
|
|
pmullw m3, filter_x_b
|
|
paddw m4, filter_rnd
|
|
paddw m2, m1
|
|
movh m1, [dstq]
|
|
paddw m4, m3
|
|
movh m3, [dstq+dst_strideq]
|
|
%endif
|
|
psraw m2, 4
|
|
psraw m4, 4
|
|
pavgw m0, m2
|
|
pavgw m2, m4
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline - also consider going to bytes here
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
lea srcq, [srcq+src_strideq*2]
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_other_y_half_loop
|
|
%undef filter_x_a
|
|
%undef filter_x_b
|
|
%undef filter_rnd
|
|
STORE_AND_RET
|
|
|
|
.x_nonhalf_y_nonhalf:
|
|
%ifdef PIC
|
|
lea bilin_filter, [bilin_filter_m]
|
|
%endif
|
|
shl x_offsetd, filter_idx_shift
|
|
shl y_offsetd, filter_idx_shift
|
|
%if ARCH_X86_64 && mmsize == 16
|
|
mova m8, [bilin_filter+x_offsetq]
|
|
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
|
|
mova m9, [bilin_filter+x_offsetq+16]
|
|
%endif
|
|
mova m10, [bilin_filter+y_offsetq]
|
|
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
|
|
mova m11, [bilin_filter+y_offsetq+16]
|
|
%endif
|
|
mova m12, [pw_8]
|
|
%define filter_x_a m8
|
|
%define filter_x_b m9
|
|
%define filter_y_a m10
|
|
%define filter_y_b m11
|
|
%define filter_rnd m12
|
|
%else ; x86-32
|
|
%if ARCH_X86=1 && CONFIG_PIC=1
|
|
; In this case, there is NO unused register. Used src_stride register. Later,
|
|
; src_stride has to be loaded from stack when it is needed.
|
|
%define tempq src_strideq
|
|
mov tempq, g_bilin_filterm
|
|
add x_offsetq, tempq
|
|
add y_offsetq, tempq
|
|
%define filter_x_a [x_offsetq]
|
|
%define filter_x_b [x_offsetq+16]
|
|
%define filter_y_a [y_offsetq]
|
|
%define filter_y_b [y_offsetq+16]
|
|
|
|
mov tempq, g_pw_8m
|
|
%define filter_rnd [tempq]
|
|
%else
|
|
add x_offsetq, bilin_filter
|
|
add y_offsetq, bilin_filter
|
|
%define filter_x_a [x_offsetq]
|
|
%define filter_x_b [x_offsetq+16]
|
|
%define filter_y_a [y_offsetq]
|
|
%define filter_y_b [y_offsetq+16]
|
|
%define filter_rnd [pw_8]
|
|
%endif
|
|
%endif
|
|
|
|
; x_offset == bilin interpolation && y_offset == bilin interpolation
|
|
%if %1 == 16
|
|
movu m0, [srcq]
|
|
movu m1, [srcq+1]
|
|
%if cpuflag(ssse3)
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
pmaddubsw m2, filter_x_a
|
|
pmaddubsw m0, filter_x_a
|
|
paddw m2, filter_rnd
|
|
paddw m0, filter_rnd
|
|
%else
|
|
punpckhbw m2, m0, m5
|
|
punpckhbw m3, m1, m5
|
|
punpcklbw m0, m5
|
|
punpcklbw m1, m5
|
|
pmullw m0, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
paddw m0, filter_rnd
|
|
pmullw m2, filter_x_a
|
|
pmullw m3, filter_x_b
|
|
paddw m2, filter_rnd
|
|
paddw m0, m1
|
|
paddw m2, m3
|
|
%endif
|
|
psraw m0, 4
|
|
psraw m2, 4
|
|
|
|
INC_SRC_BY_SRC_STRIDE
|
|
|
|
packuswb m0, m2
|
|
.x_other_y_other_loop:
|
|
%if cpuflag(ssse3)
|
|
movu m4, [srcq]
|
|
movu m3, [srcq+1]
|
|
mova m1, [dstq]
|
|
punpckhbw m2, m4, m3
|
|
punpcklbw m4, m3
|
|
pmaddubsw m2, filter_x_a
|
|
pmaddubsw m4, filter_x_a
|
|
punpckhbw m3, m1, m5
|
|
paddw m2, filter_rnd
|
|
paddw m4, filter_rnd
|
|
psraw m2, 4
|
|
psraw m4, 4
|
|
packuswb m4, m2
|
|
punpckhbw m2, m0, m4
|
|
punpcklbw m0, m4
|
|
pmaddubsw m2, filter_y_a
|
|
pmaddubsw m0, filter_y_a
|
|
punpcklbw m1, m5
|
|
paddw m2, filter_rnd
|
|
paddw m0, filter_rnd
|
|
psraw m2, 4
|
|
psraw m0, 4
|
|
%else
|
|
movu m3, [srcq]
|
|
movu m4, [srcq+1]
|
|
punpckhbw m1, m3, m5
|
|
punpckhbw m2, m4, m5
|
|
punpcklbw m3, m5
|
|
punpcklbw m4, m5
|
|
pmullw m3, filter_x_a
|
|
pmullw m4, filter_x_b
|
|
paddw m3, filter_rnd
|
|
pmullw m1, filter_x_a
|
|
pmullw m2, filter_x_b
|
|
paddw m1, filter_rnd
|
|
paddw m3, m4
|
|
paddw m1, m2
|
|
psraw m3, 4
|
|
psraw m1, 4
|
|
packuswb m4, m3, m1
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
pmullw m2, filter_y_a
|
|
pmullw m1, filter_y_b
|
|
paddw m2, filter_rnd
|
|
pmullw m0, filter_y_a
|
|
pmullw m3, filter_y_b
|
|
paddw m2, m1
|
|
mova m1, [dstq]
|
|
paddw m0, filter_rnd
|
|
psraw m2, 4
|
|
paddw m0, m3
|
|
punpckhbw m3, m1, m5
|
|
psraw m0, 4
|
|
punpcklbw m1, m5
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
INC_SRC_BY_SRC_STRIDE
|
|
add dstq, dst_strideq
|
|
%else ; %1 < 16
|
|
movh m0, [srcq]
|
|
movh m1, [srcq+1]
|
|
%if cpuflag(ssse3)
|
|
punpcklbw m0, m1
|
|
pmaddubsw m0, filter_x_a
|
|
paddw m0, filter_rnd
|
|
%else
|
|
punpcklbw m0, m5
|
|
punpcklbw m1, m5
|
|
pmullw m0, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
paddw m0, filter_rnd
|
|
paddw m0, m1
|
|
%endif
|
|
psraw m0, 4
|
|
%if cpuflag(ssse3)
|
|
packuswb m0, m0
|
|
%endif
|
|
|
|
INC_SRC_BY_SRC_STRIDE
|
|
|
|
.x_other_y_other_loop:
|
|
movh m2, [srcq]
|
|
movh m1, [srcq+1]
|
|
|
|
INC_SRC_BY_SRC_STRIDE
|
|
movh m4, [srcq]
|
|
movh m3, [srcq+1]
|
|
|
|
%if cpuflag(ssse3)
|
|
punpcklbw m2, m1
|
|
punpcklbw m4, m3
|
|
pmaddubsw m2, filter_x_a
|
|
pmaddubsw m4, filter_x_a
|
|
movh m3, [dstq+dst_strideq]
|
|
movh m1, [dstq]
|
|
paddw m2, filter_rnd
|
|
paddw m4, filter_rnd
|
|
psraw m2, 4
|
|
psraw m4, 4
|
|
packuswb m2, m2
|
|
packuswb m4, m4
|
|
punpcklbw m0, m2
|
|
punpcklbw m2, m4
|
|
pmaddubsw m0, filter_y_a
|
|
pmaddubsw m2, filter_y_a
|
|
punpcklbw m3, m5
|
|
paddw m0, filter_rnd
|
|
paddw m2, filter_rnd
|
|
psraw m0, 4
|
|
psraw m2, 4
|
|
punpcklbw m1, m5
|
|
%else
|
|
punpcklbw m2, m5
|
|
punpcklbw m1, m5
|
|
punpcklbw m4, m5
|
|
punpcklbw m3, m5
|
|
pmullw m2, filter_x_a
|
|
pmullw m1, filter_x_b
|
|
paddw m2, filter_rnd
|
|
pmullw m4, filter_x_a
|
|
pmullw m3, filter_x_b
|
|
paddw m4, filter_rnd
|
|
paddw m2, m1
|
|
paddw m4, m3
|
|
psraw m2, 4
|
|
psraw m4, 4
|
|
pmullw m0, filter_y_a
|
|
pmullw m3, m2, filter_y_b
|
|
paddw m0, filter_rnd
|
|
pmullw m2, filter_y_a
|
|
pmullw m1, m4, filter_y_b
|
|
paddw m2, filter_rnd
|
|
paddw m0, m3
|
|
movh m3, [dstq+dst_strideq]
|
|
paddw m2, m1
|
|
movh m1, [dstq]
|
|
psraw m0, 4
|
|
psraw m2, 4
|
|
punpcklbw m3, m5
|
|
punpcklbw m1, m5
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
; FIXME(rbultje) pipeline
|
|
packuswb m0, m2
|
|
pavgb m0, [secq]
|
|
punpckhbw m2, m0, m5
|
|
punpcklbw m0, m5
|
|
%endif
|
|
SUM_SSE m0, m1, m2, m3, m6, m7
|
|
mova m0, m4
|
|
|
|
INC_SRC_BY_SRC_STRIDE
|
|
lea dstq, [dstq+dst_strideq*2]
|
|
%endif
|
|
%if %2 == 1 ; avg
|
|
add secq, sec_str
|
|
%endif
|
|
dec h
|
|
jg .x_other_y_other_loop
|
|
%undef filter_x_a
|
|
%undef filter_x_b
|
|
%undef filter_y_a
|
|
%undef filter_y_b
|
|
%undef filter_rnd
|
|
STORE_AND_RET
|
|
%endmacro
|
|
|
|
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
|
|
; between the ssse3 and non-ssse3 version. It may make sense to merge their
|
|
; code in the sense that the ssse3 version would jump to the appropriate
|
|
; location in the sse/2 version, rather than duplicating that code in the
|
|
; binary.
|
|
|
|
; Emits the three block widths for one instruction-set pairing: the 4-wide
; function uses MMX registers, the 8- and 16-wide functions use XMM registers.
;   %1: cpu name handed to INIT_MMX (4xh)
;   %2: cpu name handed to INIT_XMM (8xh and 16xh)
;   %3: 0 = sub_pixel_variance, 1 = sub_pixel_avg_variance
%macro SUBPEL_VARIANCE_WIDTHS 3
INIT_MMX %1
SUBPEL_VARIANCE  4, %3
INIT_XMM %2
SUBPEL_VARIANCE  8, %3
SUBPEL_VARIANCE 16, %3
%endmacro

; Plain variance.
SUBPEL_VARIANCE_WIDTHS sse,   sse2,  0
SUBPEL_VARIANCE_WIDTHS ssse3, ssse3, 0

; Variance against the average with a second predictor.
SUBPEL_VARIANCE_WIDTHS sse,   sse2,  1
SUBPEL_VARIANCE_WIDTHS ssse3, ssse3, 1
|