VPX: Remove pmin/pmax from subpixel functions.

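The pminsw/pmaxsw instructions are unnecessary if the adds
are done in the correct order: k0k1 + k4k5 and k2k3 + k6k7 are
summed first, then the two partial sums are combined with a
signed saturating add.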

Change-Id: I4e533b8267c32e610a4b94203ad052dc9fdabd71
This commit is contained in:
Scott LaVarnway
2016-02-24 12:03:33 -08:00
parent 51beb29f52
commit dd6729f826

View File

@@ -16,6 +16,11 @@ pw_64: times 8 dw 64
; %define USE_PMULHRSW ; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
; when using this instruction. ; when using this instruction.
;
; The add order below (based on ffvp9) must be followed to prevent out-of-range results.
; x = k0k1 + k4k5
; y = k2k3 + k6k7
; z = signed SAT(x + y)
SECTION .text SECTION .text
%if ARCH_X86_64 %if ARCH_X86_64
@@ -77,17 +82,12 @@ SECTION .text
pmaddubsw %2, k0k1k4k5 pmaddubsw %2, k0k1k4k5
pmaddubsw m3, k2k3k6k7 pmaddubsw m3, k2k3k6k7
mova m4, %2 ;k0k1
mova m4, %2 mova m5, m3 ;k2k3
mova m5, m3 psrldq %2, 8 ;k4k5
psrldq %2, 8 psrldq m3, 8 ;k6k7
psrldq m3, 8
mova m6, m5
paddsw m4, m3
pmaxsw m5, %2
pminsw %2, m6
paddsw %2, m4 paddsw %2, m4
paddsw m5, m3
paddsw %2, m5 paddsw %2, m5
paddsw %2, krd paddsw %2, krd
psraw %2, 7 psraw %2, 7
@@ -157,27 +157,20 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
pmaddubsw m7, k0k1k4k5 pmaddubsw m7, k0k1k4k5
palignr m3, m2, 5 palignr m3, m2, 5
pmaddubsw m3, k2k3k6k7 pmaddubsw m3, k2k3k6k7
mova m0, m4 mova m0, m4 ;k0k1
mova m5, m1 mova m5, m1 ;k2k3
mova m2, m7 mova m2, m7 ;k0k1 upper
psrldq m4, 8 psrldq m4, 8 ;k4k5
psrldq m1, 8 psrldq m1, 8 ;k6k7
mova m6, m5
paddsw m0, m1
mova m1, m3
psrldq m7, 8
psrldq m3, 8
paddsw m2, m3
mova m3, m1
pmaxsw m5, m4
pminsw m4, m6
paddsw m4, m0 paddsw m4, m0
paddsw m4, m5 paddsw m5, m1
pmaxsw m1, m7 mova m1, m3 ;k2k3 upper
pminsw m7, m3 psrldq m7, 8 ;k4k5 upper
psrldq m3, 8 ;k6k7 upper
paddsw m7, m2 paddsw m7, m2
paddsw m4, m5
paddsw m1, m3
paddsw m7, m1 paddsw m7, m1
paddsw m4, krd paddsw m4, krd
psraw m4, 7 psraw m4, 7
packuswb m4, m4 packuswb m4, m4
@@ -240,16 +233,13 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
pmaddubsw %3, k2k3 pmaddubsw %3, k2k3
pmaddubsw %4, k4k5 pmaddubsw %4, k4k5
pmaddubsw %5, k6k7 pmaddubsw %5, k6k7
paddsw %2, %4
paddsw %5, %3
paddsw %2, %5 paddsw %2, %5
mova %1, %3 paddsw %2, krd
pminsw %3, %4 psraw %2, 7
pmaxsw %1, %4 packuswb %2, %2
paddsw %2, %3 SWAP %1, %2
paddsw %1, %2
paddsw %1, krd
psraw %1, 7
packuswb %1, %1
%endm %endm
;------------------------------------------------------------------------------- ;-------------------------------------------------------------------------------
@@ -293,39 +283,33 @@ cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
pmaddubsw m3, k4k5 pmaddubsw m3, k4k5
palignr m7, m4, 13 palignr m7, m4, 13
paddsw m1, m5
mova m5, m6
mova m0, m2
palignr m5, m4, 5
pminsw m2, m3
pmaddubsw m7, k6k7
pmaxsw m3, m0
paddsw m1, m2
mova m0, m6 mova m0, m6
palignr m6, m4, 1 palignr m0, m4, 5
pmaddubsw m5, k2k3 pmaddubsw m7, k6k7
paddsw m1, m3 paddsw m1, m3
paddsw m2, m5
paddsw m1, m2
mova m5, m6
palignr m6, m4, 1
pmaddubsw m0, k2k3
pmaddubsw m6, k0k1 pmaddubsw m6, k0k1
palignr m0, m4, 9 palignr m5, m4, 9
paddsw m1, krd paddsw m1, krd
pmaddubsw m0, k4k5 pmaddubsw m5, k4k5
mova m4, m5
psraw m1, 7 psraw m1, 7
pminsw m5, m0 paddsw m0, m7
paddsw m6, m7 %ifidn %1, h8_avg
movh m7, [dstq]
movh m2, [dstq + dstrideq]
%endif
packuswb m1, m1 packuswb m1, m1
paddsw m6, m5 paddsw m6, m5
pmaxsw m0, m4
paddsw m6, m0 paddsw m6, m0
paddsw m6, krd paddsw m6, krd
psraw m6, 7 psraw m6, 7
packuswb m6, m6 packuswb m6, m6
%ifidn %1, h8_avg %ifidn %1, h8_avg
movh m0, [dstq] pavgb m1, m7
movh m2, [dstq + dstrideq]
pavgb m1, m0
pavgb m6, m2 pavgb m6, m2
%endif %endif
movh [dstq], m1 movh [dstq], m1
@@ -388,7 +372,7 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
pmaddubsw m1, k2k3 pmaddubsw m1, k2k3
palignr m2, m7, 9 palignr m2, m7, 9
pmaddubsw m2, k4k5 pmaddubsw m2, k4k5
paddsw m0, m3 paddsw m1, m3
mova m3, m4 mova m3, m4
punpckhbw m4, m4 punpckhbw m4, m4
mova m5, m4 mova m5, m4
@@ -403,17 +387,13 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
pmaddubsw m6, k4k5 pmaddubsw m6, k4k5
palignr m7, m3, 13 palignr m7, m3, 13
pmaddubsw m7, k6k7 pmaddubsw m7, k6k7
mova m3, m1
pmaxsw m1, m2
pminsw m2, m3
paddsw m0, m2 paddsw m0, m2
paddsw m0, m1 paddsw m0, m1
paddsw m4, m7 %ifidn %1, h8_avg
mova m7, m5 mova m1, [dstq]
pmaxsw m5, m6 %endif
pminsw m6, m7
paddsw m4, m6 paddsw m4, m6
paddsw m5, m7
paddsw m4, m5 paddsw m4, m5
paddsw m0, krd paddsw m0, krd
paddsw m4, krd paddsw m4, krd
@@ -421,7 +401,6 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
psraw m4, 7 psraw m4, 7
packuswb m0, m4 packuswb m0, m4
%ifidn %1, h8_avg %ifidn %1, h8_avg
mova m1, [dstq]
pavgb m0, m1 pavgb m0, m1
%endif %endif
lea srcq, [srcq + sstrideq] lea srcq, [srcq + sstrideq]
@@ -488,27 +467,21 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
movx m7, [src1q + sstride6q ] ;H movx m7, [src1q + sstride6q ] ;H
punpcklbw m6, m7 ;G H punpcklbw m6, m7 ;G H
pmaddubsw m6, k6k7 pmaddubsw m6, k6k7
mova tmp, m2
pmaddubsw m3, k2k3 pmaddubsw m3, k2k3
pmaddubsw m1, k0k1 pmaddubsw m1, k0k1
pmaxsw m2, m4 paddsw m0, m4
paddsw m0, m6 paddsw m2, m6
movx m6, [srcq + sstrideq * 8 ] ;H next iter movx m6, [srcq + sstrideq * 8 ] ;H next iter
punpcklbw m7, m6 punpcklbw m7, m6
pmaddubsw m7, k6k7 pmaddubsw m7, k6k7
pminsw m4, tmp
paddsw m0, m4
mova m4, m3
paddsw m0, m2 paddsw m0, m2
pminsw m3, m5
pmaxsw m5, m4
paddsw m0, krd paddsw m0, krd
psraw m0, 7 psraw m0, 7
paddsw m1, m7 paddsw m1, m5
packuswb m0, m0 packuswb m0, m0
paddsw m3, m7
paddsw m1, m3 paddsw m1, m3
paddsw m1, m5
paddsw m1, krd paddsw m1, krd
psraw m1, 7 psraw m1, 7
lea srcq, [srcq + sstrideq * 2 ] lea srcq, [srcq + sstrideq * 2 ]
@@ -550,10 +523,7 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
punpcklbw m4, m5 ;E F punpcklbw m4, m5 ;E F
pmaddubsw m2, k2k3 pmaddubsw m2, k2k3
pmaddubsw m4, k4k5 pmaddubsw m4, k4k5
paddsw m0, m6 paddsw m2, m6
mova m1, m2
pmaxsw m2, m4
pminsw m4, m1
paddsw m0, m4 paddsw m0, m4
paddsw m0, m2 paddsw m0, m2
paddsw m0, krd paddsw m0, krd
@@ -610,12 +580,9 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
punpcklbw m3, m5 ;A B punpcklbw m3, m5 ;A B
movh m7, [srcq + sstrideq * 2 + 8] ;C movh m7, [srcq + sstrideq * 2 + 8] ;C
pmaddubsw m6, k6k7 pmaddubsw m6, k6k7
mova m1, m2
movh m5, [src1q + sstrideq * 2 + 8] ;D movh m5, [src1q + sstrideq * 2 + 8] ;D
pmaxsw m2, m4
punpcklbw m7, m5 ;C D punpcklbw m7, m5 ;C D
pminsw m4, m1 paddsw m2, m6
paddsw m0, m6
pmaddubsw m3, k0k1 pmaddubsw m3, k0k1
movh m1, [srcq + sstrideq * 4 + 8] ;E movh m1, [srcq + sstrideq * 4 + 8] ;E
paddsw m0, m4 paddsw m0, m4
@@ -633,10 +600,8 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
%ifidn %1, v8_avg %ifidn %1, v8_avg
mova m4, [dstq] mova m4, [dstq]
%endif %endif
mova m6, m7 movh [dstq], m0
pmaxsw m7, m1 paddsw m7, m2
pminsw m1, m6
paddsw m3, m2
paddsw m3, m1 paddsw m3, m1
paddsw m3, m7 paddsw m3, m7
paddsw m3, krd paddsw m3, krd