vpx/vp9/encoder/x86/vp9_subpel_variance.asm
Ronald S. Bultje 8fb6c58191 Implement sse2 and ssse3 versions for all sub_pixel_variance sizes.
Overall speedup around 5% (bus @ 1500kbps first 50 frames 4min10 ->
3min58). Specific changes to timings for each function compared to
original assembly-optimized versions (or just new version timings if
no previous assembly-optimized version was available):

sse2   4x4:    99 ->   82 cycles
sse2   4x8:           128 cycles
sse2   8x4:           121 cycles
sse2   8x8:   149 ->  129 cycles
sse2   8x16:  235 ->  245 cycles (?)
sse2  16x8:   269 ->  203 cycles
sse2  16x16:  441 ->  349 cycles
sse2  16x32:          641 cycles
sse2  32x16:          643 cycles
sse2  32x32: 1733 -> 1154 cycles
sse2  32x64:         2247 cycles
sse2  64x32:         2323 cycles
sse2  64x64: 6984 -> 4442 cycles

ssse3  4x4:           100 cycles (?)
ssse3  4x8:           103 cycles
ssse3  8x4:            71 cycles
ssse3  8x8:           147 cycles
ssse3  8x16:          158 cycles
ssse3 16x8:   188 ->  162 cycles
ssse3 16x16:  316 ->  273 cycles
ssse3 16x32:          535 cycles
ssse3 32x16:          564 cycles
ssse3 32x32:          973 cycles
ssse3 32x64:         1930 cycles
ssse3 64x32:         1922 cycles
ssse3 64x64:         3760 cycles

Change-Id: I81ff6fe51daf35a40d19785167004664d7e0c59d
2013-06-20 09:34:25 -07:00

1062 lines
33 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
; Bilinear rounding constant: 8 == 1 << 3, added before the >> 4 shift in
; out = (a*(16-offset) + b*offset + 8) >> 4.
pw_8: times 8 dw 8
; SSE2 bilinear coefficient table, indexed by (offset << 5): for each
; 1/16th-pel offset 0..15, 8 words of (16 - offset) followed by 8 words of
; (offset) — the two pmullw taps of the bilinear filter.
bilin_filter_m_sse2: times 8 dw 16
times 8 dw 0
times 8 dw 15
times 8 dw 1
times 8 dw 14
times 8 dw 2
times 8 dw 13
times 8 dw 3
times 8 dw 12
times 8 dw 4
times 8 dw 11
times 8 dw 5
times 8 dw 10
times 8 dw 6
times 8 dw 9
times 8 dw 7
times 16 dw 8 ; offset 8: both taps are 8, collapsed into one row
times 8 dw 7
times 8 dw 9
times 8 dw 6
times 8 dw 10
times 8 dw 5
times 8 dw 11
times 8 dw 4
times 8 dw 12
times 8 dw 3
times 8 dw 13
times 8 dw 2
times 8 dw 14
times 8 dw 1
times 8 dw 15
; SSSE3 bilinear coefficient table, indexed by (offset << 4): same taps as
; above but as interleaved byte pairs (16-offset, offset) for pmaddubsw,
; which halves the per-offset footprint to 16 bytes.
bilin_filter_m_ssse3: times 8 db 16, 0
times 8 db 15, 1
times 8 db 14, 2
times 8 db 13, 3
times 8 db 12, 4
times 8 db 11, 5
times 8 db 10, 6
times 8 db 9, 7
times 16 db 8 ; offset 8: both byte taps are 8
times 8 db 7, 9
times 8 db 6, 10
times 8 db 5, 11
times 8 db 4, 12
times 8 db 3, 13
times 8 db 2, 14
times 8 db 1, 15
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int x_offset, int y_offset,
; const uint8_t *dst, ptrdiff_t dst_stride,
; int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.
;
; SUM_SSE: fold the error of two src/dst word-vector pairs into the running
; accumulators:
;   %5 (sum) += (%1 - %2) + (%3 - %4)      word-wise, signed
;   %6 (sse) += (%1 - %2)^2 + (%3 - %4)^2  dword-wise (via pmaddwd)
; Clobbers %1 and %3 (they end up holding the squared differences).
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
psubw %3, %4 ; %3 = src2 - dst2
psubw %1, %2 ; %1 = src1 - dst1
paddw %5, %3
pmaddwd %3, %3 ; square differences, pair-sum into dwords
paddw %5, %1
pmaddwd %1, %1
paddd %6, %3
paddd %6, %1
%endmacro
; STORE_AND_RET: horizontally reduce the accumulators (m6 = word-wise sum,
; m7 = dword-wise sse), store *sse through the pointer re-read from ssem,
; and return the (signed) sum in rax. Relies on m5 being the zero register.
%macro STORE_AND_RET 0
%if mmsize == 16
; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
; We have to sign-extend it before adding the words within the register
; and outputting to a dword.
pcmpgtw m5, m6 ; mask for 0 > x
movhlps m3, m7
punpcklwd m4, m6, m5
punpckhwd m6, m5 ; sign-extend m6 word->dword
paddd m7, m3
paddd m6, m4
pshufd m3, m7, 0x1
movhlps m4, m6
paddd m7, m3
paddd m6, m4
mov r1, ssem ; r1 = unsigned int *sse
pshufd m4, m6, 0x1
movd [r1], m7 ; store sse
paddd m6, m4
movd rax, m6 ; store sum as return value
%else ; mmsize == 8
pshufw m4, m6, 0xe
pshufw m3, m7, 0xe
paddw m6, m4
paddd m7, m3
pcmpgtw m5, m6 ; mask for 0 > x
mov r1, ssem ; r1 = unsigned int *sse
punpcklwd m6, m5 ; sign-extend m6 word->dword
movd [r1], m7 ; store sse
pshufw m4, m6, 0xe
paddd m6, m4
movd rax, m6 ; store sum as return value
%endif
RET
%endmacro
; SUBPEL_VARIANCE: emit vp9_sub_pixel_variance%1xh for a fixed width W (%1,
; one of 4/8/16). Dispatches eight ways on (x_offset, y_offset), each being
; 0 (copy), 8 (half-pel, cheap pavgb) or other (true bilinear filter via the
; bilin_filter_m tables). Sum is returned in rax and *sse is stored by
; STORE_AND_RET. For W < 16 two rows are processed per loop iteration, so
; heightd is pre-halved. m5 is kept zero throughout for byte->word unpacks;
; m6/m7 are the sum/sse accumulators.
%macro SUBPEL_VARIANCE 1 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4 ; 16 bytes (interleaved db taps) per offset
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5 ; 32 bytes (two dw tap rows) per offset
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
%ifdef PIC
; PIC: one extra GPR; the sse argument's register slot (sseq) is reused to
; hold the filter-table base, and the sse pointer itself is re-read from
; its home slot (ssem) in STORE_AND_RET.
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
%define bilin_filter sseq
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
%define bilin_filter bilin_filter_m
%endif
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
; could perhaps use it for something more productive then
pxor m5, m5 ; dedicated zero register
%if %1 < 16
sar heightd, 1 ; narrow blocks do two rows per iteration
%endif
; FIXME(rbultje) replace by jumptable?
test x_offsetd, x_offsetd
jnz .x_nonzero
; x_offset == 0
test y_offsetd, y_offsetd
jnz .x_zero_y_nonzero
; x_offset == 0 && y_offset == 0
; Full-pel position: straight src-vs-dst comparison, no filtering.
.x_zero_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
mova m1, [dstq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_zero_y_zero_loop
STORE_AND_RET
.x_zero_y_nonzero:
cmp y_offsetd, 8
jne .x_zero_y_nonhalf
; x_offset == 0 && y_offset == 0.5
; Vertical half-pel: average the current row with the next one.
.x_zero_y_half_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_zero_y_half_loop
STORE_AND_RET
.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift ; scale offset to table row
%if ARCH_X86_64 && mmsize == 16
; x86-64 xmm: preload filter taps and rounder into high registers.
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
; not enough registers: keep taps/rounder as memory operands.
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
%if cpuflag(ssse3)
; ssse3: interleave the two rows and apply both taps with one pmaddubsw.
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
; instructions is the same (5), but it is 1 mul instead of 2, so might be
; slightly faster because of pmullw latency. It would also cut our rodata
; tables in half for this function, and save 1-2 registers on x86-64.
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m4, filter_y_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movh m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m4, filter_y_b
paddw m0, m1
paddw m2, filter_rnd
movh m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
.x_nonzero:
cmp x_offsetd, 8
jne .x_nonhalf
; x_offset == 0.5
test y_offsetd, y_offsetd
jnz .x_half_y_nonzero
; x_offset == 0.5 && y_offset == 0
; Horizontal half-pel: average each row with itself shifted by one pixel.
.x_half_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m4, [srcq+1]
movh m2, [srcq+src_strideq]
movh m1, [dstq]
pavgb m0, m4
movh m4, [srcq+src_strideq+1]
movh m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_half_y_zero_loop
STORE_AND_RET
.x_half_y_nonzero:
cmp y_offsetd, 8
jne .x_half_y_nonhalf
; x_offset == 0.5 && y_offset == 0.5
; Both half-pel: horizontally averaged rows are averaged vertically too;
; the previous row's horizontal average is carried across iterations in m0.
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3 ; prime m0 with row 0's horizontal average
.x_half_y_half_loop:
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
pavgb m4, m3
pavgb m0, m4 ; vertical average with previous row
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4 ; carry this row's h-average to the next iteration
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
movh m2, [srcq]
movh m3, [srcq+1]
movh m4, [srcq+src_strideq]
movh m1, [srcq+src_strideq+1]
pavgb m2, m3
pavgb m4, m1
pavgb m0, m2
pavgb m2, m4
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_half_y_half_loop
STORE_AND_RET
.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
; Horizontal half-pel via pavgb, then true vertical bilinear between the
; current and previous averaged rows (previous carried in m0).
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_other_loop:
movu m4, [srcq]
movu m2, [srcq+1]
mova m1, [dstq]
pavgb m4, m2
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
punpcklbw m0, m5
paddw m2, m3
punpcklbw m3, m4, m5
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
%endif
punpckhbw m3, m1, m5
psraw m0, 4
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
%if notcpuflag(ssse3)
punpcklbw m0, m5 ; sse2 path carries m0 pre-unpacked to words
%endif
.x_half_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
movh m4, [srcq+src_strideq]
movh m3, [srcq+src_strideq+1]
pavgb m2, m1
pavgb m4, m3
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movh m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
paddw m0, m1
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m2, m1
movh m1, [dstq]
%endif
psraw m0, 4
psraw m2, 4
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
.x_nonhalf:
test y_offsetd, y_offsetd
jnz .x_nonhalf_y_nonzero
; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
; True horizontal bilinear between each pixel and its right neighbor.
.x_other_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
pmullw m0, filter_x_a
pmullw m4, filter_x_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
movh m2, [srcq+src_strideq]
movh m4, [srcq+src_strideq+1]
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
punpcklbw m0, m1
movh m1, [dstq]
punpcklbw m2, m4
pmaddubsw m0, filter_x_a
pmaddubsw m2, filter_x_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m4, filter_x_b
paddw m0, m1
paddw m2, filter_rnd
movh m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET
.x_nonhalf_y_nonzero:
cmp y_offsetd, 8
jne .x_nonhalf_y_nonhalf
; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
; Horizontal bilinear, then vertical half-pel average between consecutive
; filtered rows; the previous filtered row is carried across iterations.
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4
add srcq, src_strideq
packuswb m0, m2 ; keep previous filtered row as bytes (register economy)
.x_other_y_half_loop:
movu m4, [srcq]
movu m3, [srcq+1]
%if cpuflag(ssse3)
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
pavgb m0, m4 ; vertical half-pel in the byte domain
punpckhbw m3, m1, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpckhbw m2, m4, m5
punpckhbw m1, m3, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
paddw m4, m3
paddw m2, m1
mova m1, [dstq]
psraw m4, 4
psraw m2, 4
punpckhbw m3, m1, m5
; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
; have a 1-register shortage to be able to store the backup of the bilin
; filtered second line as words as cache for the next line. Packing into
; a byte costs 1 pack and 2 unpacks, but saves a register.
packuswb m4, m2
punpcklbw m1, m5
pavgb m0, m4
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
add srcq, src_strideq
psraw m0, 4
.x_other_y_half_loop:
movh m2, [srcq]
movh m1, [srcq+1]
movh m4, [srcq+src_strideq]
movh m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
paddw m2, filter_rnd
paddw m4, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
movh m1, [dstq]
paddw m4, m3
movh m3, [dstq+dst_strideq]
%endif
psraw m2, 4
psraw m4, 4
; narrow path: vertical half-pel done with pavgw in the word domain,
; avoiding the pack/unpack round-trip used by the wide path
pavgw m0, m2
pavgw m2, m4
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET
.x_nonhalf_y_nonhalf:
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m11, [bilin_filter+y_offsetq+16]
%endif
mova m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
; x_offset == bilin interpolation && y_offset == bilin interpolation
; Full separable bilinear: horizontal filter each row, then vertical filter
; between the current and previous horizontally-filtered rows (carried in m0).
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4
add srcq, src_strideq
packuswb m0, m2 ; carry previous filtered row as bytes
.x_other_y_other_loop:
%if cpuflag(ssse3)
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
punpckhbw m3, m1, m5
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
punpcklbw m1, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
psraw m0, 4
%else
movu m3, [srcq]
movu m4, [srcq+1]
punpckhbw m1, m3, m5
punpckhbw m2, m4, m5
punpcklbw m3, m5
punpcklbw m4, m5
pmullw m3, filter_x_a
pmullw m4, filter_x_b
paddw m3, filter_rnd
pmullw m1, filter_x_a
pmullw m2, filter_x_b
paddw m1, filter_rnd
paddw m3, m4
paddw m1, m2
psraw m3, 4
psraw m1, 4
packuswb m4, m3, m1 ; m4 = this row's filtered bytes, for next iteration
punpckhbw m2, m0, m5
punpcklbw m0, m5
pmullw m2, filter_y_a
pmullw m1, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m2, m1
mova m1, [dstq]
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
punpckhbw m3, m1, m5
psraw m0, 4
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
psraw m0, 4
%if cpuflag(ssse3)
packuswb m0, m0 ; ssse3 carries the previous row as bytes for pmaddubsw
%endif
add srcq, src_strideq
.x_other_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
movh m4, [srcq+src_strideq]
movh m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movh m3, [dstq+dst_strideq]
movh m1, [dstq]
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m2, m2
packuswb m4, m4
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
psraw m0, 4
psraw m2, 4
punpcklbw m1, m5
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
paddw m4, m3
psraw m2, 4
psraw m4, 4
pmullw m0, filter_y_a
pmullw m3, m2, filter_y_b
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m0, m3
movh m3, [dstq+dst_strideq]
paddw m2, m1
movh m1, [dstq]
psraw m0, 4
psraw m2, 4
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
%endmacro
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
;
; Instantiate one function per (width, ISA) pair. Width 4 uses 64-bit MMX
; registers (INIT_MMX; "sse" because the MMX extensions such as pavgb are
; needed); widths 8/16 use 128-bit XMM registers.
INIT_MMX sse
SUBPEL_VARIANCE 4
INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
INIT_MMX ssse3
SUBPEL_VARIANCE 4
INIT_XMM ssse3
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16