eb88b172fe

The only difference between the two was that the vp9 function allowed for every step in the bilinear filter (16 steps) while vp8 only allowed for half of those. Since all the call sites in vp9 shift the input left by one (<< 1), it only ever used the same steps as vp8. This will allow moving the subpel variance to vpx_dsp with the rest of the variance functions.

Change-Id: I6fa2509350a2dc610c46b3e15bde98a15a084b75
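As an illustration of why only half of the steps were ever reachable, here is a minimal C sketch (names are hypothetical, not from the source): with 1/16-step weights {16 - j, j} and 1/8-step weights {16 - 2k, 2k}, a caller that doubles its offset before the lookup can only land on the even entries of the 16-step table, and those are exactly the 8-step table.

    /* Illustrative sketch only; identifiers are hypothetical. */
    static int same_step(int k) {            /* k = 0..7, the 8-step range   */
      int j = k << 1;                        /* what the vp9 call sites did  */
      int vp9_a = 16 - j,     vp9_b = j;     /* even entry, 16-step table    */
      int vp8_a = 16 - 2 * k, vp8_b = 2 * k; /* entry k, 8-step table        */
      return vp9_a == vp8_a && vp9_b == vp8_b; /* holds for every k          */
    }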
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw  0
                     times 8 dw 14
                     times 8 dw  2
                     times 8 dw 12
                     times 8 dw  4
                     times 8 dw 10
                     times 8 dw  6
                     times 16 dw 8
                     times 8 dw  6
                     times 8 dw 10
                     times 8 dw  4
                     times 8 dw 12
                     times 8 dw  2
                     times 8 dw 14

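; The table above holds eight filter steps; entry k is two 8-word rows with
; the weights (16 - 2k) and 2k, so each step occupies 32 bytes. The middle
; step (8, 8) is emitted as a single "times 16 dw 8".
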
SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.

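; In pseudo-C, each filtered prediction sample is
;   out = ((16 - f) * a + f * b + 8) >> 4
; (pw_8 supplies the rounding constant, psrlw by 4 does the divide). The
; macros below accumulate sum(pred - dst) in m6 and sum((pred - dst)^2) in m7,
; so the caller can form the variance as sse - (sum * sum) / (w * h).
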
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw %3, %4
  psubw %1, %2
  mova %4, %3 ; make copies to manipulate to calc sum
  mova %2, %1 ; use originals for calc sse
  pmaddwd %3, %3
  paddw %4, %2
  pmaddwd %1, %1
  movhlps %2, %4
  paddd %6, %3
  paddw %4, %2
  pxor %2, %2
  pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
  punpcklwd %4, %2 ; sign-extend word to dword
  paddd %6, %1
  paddd %5, %4

%endmacro

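; STORE_AND_RET horizontally reduces the per-lane accumulators: it folds the
; four dwords of m7 (sse) and m6 (sum) down to one value each, writes sse to
; the pointer that was passed in, and returns the signed sum in rax.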
%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  movhlps m3, m7
  movhlps m4, m6
  paddd m7, m3
  paddd m6, m4
  pshufd m3, m7, 0x1
  pshufd m4, m6, 0x1
  paddd m7, m3
  paddd m6, m4
  mov r1, ssem ; r1 = unsigned int *sse
  movd [r1], m7 ; store sse
  movd rax, m6 ; store sum as return value
%endif
  RET
%endmacro

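; INC_SRC_BY_SRC_STRIDE/INC_SRC_BY_SRC_2STRIDE advance srcq by one or two
; rows. Strides are given in samples and the samples are 16 bits wide here,
; hence the *2/*4 byte scaling; on 32-bit PIC builds the stride lives in
; memory (src_stridemp) because there are not enough registers to keep it
; resident.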
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  lea srcq, [srcq + src_stridemp*2]
%else
  lea srcq, [srcq + src_strideq*2]
%endif
%endmacro

%macro INC_SRC_BY_SRC_2STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  lea srcq, [srcq + src_stridemp*4]
%else
  lea srcq, [srcq + src_strideq*4]
%endif
%endmacro

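; SUBPEL_VARIANCE W expands to a highbd_sub_pixel_varianceWxh function (W is
; 8 or 16 below); passing 1 as the second argument builds the
; highbd_sub_pixel_avg_variance variant, which blends in the second predictor
; in secq before computing the variance. Each bilinear filter entry is two
; 8-word vectors (32 bytes), which is why an offset is shifted left by
; filter_idx_shift before being used as a byte index into bilin_filter_m.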
%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5

%ifdef PIC ; 64bit PIC
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                               x_offset, y_offset, \
                                               dst, dst_stride, \
                                               sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                           y_offset, dst, dst_stride, height, sse
  %endif
  %define h heightd
  %define bilin_filter sseq
%else
  %if ARCH_X86=1 && CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst, dst_stride, \
                                                 sec, sec_stride, \
                                                 height, sse, g_bilin_filter, g_pw_8
      %define h dword heightm
      %define sec_str sec_stridemp

      ; Store bilin_filter and pw_8 location in stack
      GET_GOT eax
      add esp, 4 ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1 ; load eax, ecx back
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, dst, dst_stride, height, \
                                             sse, g_bilin_filter, g_pw_8
      %define h heightd

      ; Store bilin_filter and pw_8 location in stack
      GET_GOT eax
      add esp, 4 ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1 ; load eax, ecx back
    %endif
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                                 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst, dst_stride, \
                                                 sec, sec_stride, \
                                                 height, sse
      %if ARCH_X86_64
        %define h heightd
        %define sec_str sec_strideq
      %else
        %define h dword heightm
        %define sec_str sec_stridemp
      %endif
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, dst, dst_stride, height, sse
      %define h heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT %1 <= 16 ; m6 overflows if w > 16
  pxor m6, m6 ; sum
  pxor m7, m7 ; sse

%if %1 < 16
  sar h, 1
%endif
%if %2 == 1 ; avg
  shl sec_str, 1
%endif

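  ; The code below dispatches on (x_offset, y_offset), each of which is 0,
  ; half, or an arbitrary bilinear step, giving nine specialised cases:
  ; full-pel positions use plain loads, half-pel positions use pavgw, and
  ; everything else goes through the pmullw-based bilinear filter.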
  ; FIXME(rbultje) replace by jumptable?
  test x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq + 16]
  mova m1, [dstq]
  mova m3, [dstq + 16]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m2, [secq+16]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq + src_strideq*2]
  mova m1, [dstq]
  mova m3, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m2, [secq]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [dstq]
  mova m3, [dstq+16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m2, [dstq]
  mova m3, [dstq+dst_strideq*2]
  pavgw m0, m1
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m1, [secq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
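  ; On x86-64 the selected filter taps and the rounding constant are kept in
  ; spare xmm registers (m8-m10 here); on x86-32 there are not enough
  ; registers, so the filter_* names resolve to memory operands instead.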
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [dstq]
  mova m3, [dstq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m4, m1
  mova m2, [dstq]
  mova m3, [dstq+dst_strideq*2]
  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m1, [secq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  mova m2, [dstq]
  mova m3, [dstq + 16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  mova m2, [dstq]
  mova m3, [dstq + dst_strideq*2]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m1, [secq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
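  ; The previous row, already averaged horizontally, is carried across loop
  ; iterations in m0/m1 (just m0 for the narrow path), so each iteration only
  ; averages the newly loaded row(s) and blends them with the carried one.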
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m1, m3
  mova m4, [dstq]
  mova m5, [dstq + 16]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2
  mova m1, m3

  lea srcq, [srcq + src_strideq*2]
  lea dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m2, m3
  mova m4, [dstq]
  mova m5, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m2, [secq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3

  lea srcq, [srcq + src_strideq*4]
  lea dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m1, filter_rnd
  paddw m1, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m1, 4
  paddw m0, m2
  mova m2, [dstq]
  psrlw m0, 4
  mova m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5

  lea srcq, [srcq + src_strideq*2]
  lea dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m4, filter_rnd
  paddw m4, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m4, 4
  paddw m0, m2
  mova m2, [dstq]
  psrlw m0, 4
  mova m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m4, [secq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5

  lea srcq, [srcq + src_strideq*4]
  lea dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  mova m4, [dstq]
  mova m5, [dstq+16]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m2, [srcq+2]
  movu m3, [srcq+src_strideq*2+2]
  mova m4, [dstq]
  mova m5, [dstq+dst_strideq*2]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m1, [secq]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7

  lea srcq, [srcq+src_strideq*4]
  lea dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [dstq]
  mova m5, [dstq+16]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m1, m3
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2
  mova m1, m3

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [dstq]
  mova m5, [dstq+dst_strideq*2]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m2, m3
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m2, [secq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3

  lea srcq, [srcq+src_strideq*4]
  lea dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
  ; loading filter - this is same as in 8-bit depth
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [bilin_filter+y_offsetq]
  mova m11, [bilin_filter+y_offsetq+16]
  mova m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; In this case, there is NO unused register. Used src_stride register. Later,
  ; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
  add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
  ; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
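  ; The horizontally filtered result for the previous row is carried across
  ; iterations in m0/m1 (just m0 for the narrow path); each iteration filters
  ; the new row horizontally and then applies the vertical filter between the
  ; carried row and the new one.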
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  movu m1, [srcq+16]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  movu m3, [srcq+16]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m1, filter_rnd
  mova m2, [dstq]
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  mova m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  pavgw m1, [secq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea dstq, [dstq + dst_strideq * 2]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  movu m3, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m4, filter_rnd
  mova m2, [dstq]
  paddw m4, m3
  psrlw m0, 4
  psrlw m4, 4
  mova m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [secq]
  add secq, sec_str
  pavgw m4, [secq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5

  INC_SRC_BY_SRC_2STRIDE
  lea dstq, [dstq + dst_strideq * 4]
%if %2 == 1 ; avg
  add secq, sec_str
%endif
%endif
  dec h
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

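; Instantiate the SSE2 versions: SUBPEL_VARIANCE 8/16 build the plain
; highbd_sub_pixel_variance8xh/16xh routines, and the ", 1" forms build the
; _avg_ variants that blend in the second predictor from secq before
; computing the variance.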
INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1