vpx_subpixel_8t_ssse3: fix reg counts/access
Fixes the build on Windows x64. Previously 'heightq', i.e. the full 64-bit register, was accessed when only the 32-bit value was needed; because the value comes from a stack variable, the upper bits were undefined. Also bumps the register/xmm counts: users of SETUP_LOCAL_VARS touch xmm13 in 64-bit builds, and filter_block1d16_v* uses one extra temp variable.

Change-Id: I9c768c0b2047481d1d3b11c2e16b2f8de6eb0d80
This commit is contained in:
parent
a3df343cda
commit
683b5a3161
@ -104,7 +104,7 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
|
||||
%define k0k1k4k5 m8
|
||||
%define k2k3k6k7 m9
|
||||
%define krd m10
|
||||
%define orig_height r7
|
||||
%define orig_height r7d
|
||||
mova krd, [GLOBAL(pw_64)]
|
||||
pshuflw k0k1k4k5, m4, 0b ;k0_k1
|
||||
pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
|
||||
@ -131,8 +131,8 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
|
||||
mova k2k3k6k7, m7
|
||||
mova krd, m1
|
||||
%endif
|
||||
mov orig_height, heightq
|
||||
shr heightq, 1
|
||||
mov orig_height, heightd
|
||||
shr heightd, 1
|
||||
.loop:
|
||||
;Do two rows at once
|
||||
movh m0, [srcq - 3]
|
||||
@ -200,12 +200,12 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
|
||||
lea dstq, [dstq + 2 * dstrideq ]
|
||||
prefetcht0 [srcq + 2 * sstrideq - 3]
|
||||
|
||||
dec heightq
|
||||
dec heightd
|
||||
jnz .loop
|
||||
|
||||
; Do last row if output_height is odd
|
||||
mov heightq, orig_height
|
||||
and heightq, 1
|
||||
mov heightd, orig_height
|
||||
and heightd, 1
|
||||
je .done
|
||||
|
||||
movh m0, [srcq - 3] ; load src
|
||||
@ -254,17 +254,17 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
%macro SUBPIX_HFILTER8 1
|
||||
cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \
|
||||
cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
|
||||
src, sstride, dst, dstride, height, filter
|
||||
mova m4, [filterq]
|
||||
SETUP_LOCAL_VARS
|
||||
%if ARCH_X86_64
|
||||
%define orig_height r7
|
||||
%define orig_height r7d
|
||||
%else
|
||||
%define orig_height heightmp
|
||||
%endif
|
||||
mov orig_height, heightq
|
||||
shr heightq, 1
|
||||
mov orig_height, heightd
|
||||
shr heightd, 1
|
||||
|
||||
.loop:
|
||||
movh m0, [srcq - 3]
|
||||
@ -336,12 +336,12 @@ cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \
|
||||
lea srcq, [srcq + sstrideq ]
|
||||
lea dstq, [dstq + 2 * dstrideq ]
|
||||
prefetcht0 [srcq + 2 * sstrideq - 3]
|
||||
dec heightq
|
||||
dec heightd
|
||||
jnz .loop
|
||||
|
||||
;Do last row if output_height is odd
|
||||
mov heightq, orig_height
|
||||
and heightq, 1
|
||||
mov heightd, orig_height
|
||||
and heightd, 1
|
||||
je .done
|
||||
|
||||
movh m0, [srcq - 3]
|
||||
@ -361,7 +361,7 @@ cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
%macro SUBPIX_HFILTER16 1
|
||||
cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \
|
||||
cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
|
||||
src, sstride, dst, dstride, height, filter
|
||||
mova m4, [filterq]
|
||||
SETUP_LOCAL_VARS
|
||||
@ -427,7 +427,7 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \
|
||||
lea srcq, [srcq + sstrideq]
|
||||
mova [dstq], m0
|
||||
lea dstq, [dstq + dstrideq]
|
||||
dec heightq
|
||||
dec heightd
|
||||
jnz .loop
|
||||
RET
|
||||
%endm
|
||||
@ -527,11 +527,11 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
|
||||
%endif
|
||||
movx [dstq], m1
|
||||
add dstq, dst_stride
|
||||
sub heightq, 2
|
||||
cmp heightq, 1
|
||||
sub heightd, 2
|
||||
cmp heightd, 1
|
||||
jg .loop
|
||||
|
||||
cmp heightq, 0
|
||||
cmp heightd, 0
|
||||
je .done
|
||||
|
||||
movx m0, [srcq ] ;A
|
||||
@ -570,7 +570,7 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
%macro SUBPIX_VFILTER16 1
|
||||
cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*2), 13, LOCAL_VARS_SIZE, \
|
||||
cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
|
||||
src, sstride, dst, dstride, height, filter
|
||||
|
||||
mova m4, [filterq]
|
||||
@ -655,7 +655,7 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*2), 13, LOCAL_VARS_SIZE, \
|
||||
%endif
|
||||
movh [dstq + 8], m3
|
||||
add dstq, dst_stride
|
||||
dec heightq
|
||||
dec heightd
|
||||
jnz .loop
|
||||
RET
|
||||
%endm
|
||||
|
Loading…
x
Reference in New Issue
Block a user