vp9/x86: 16px MC functions (64bit only).
Cycle counts for large MCs (old -> new on ped1080p.webm, mx!=0&&my!=0):
16x8: 876 -> 870 (0.7%)
16x16: 1444 -> 1435 (0.7%)
16x32: 2784 -> 2748 (1.3%)
32x16: 2455 -> 2349 (4.5%)
32x32: 4641 -> 4084 (13.6%)
32x64: 9200 -> 7834 (17.4%)
64x32: 8980 -> 7197 (24.8%)
64x64: 17330 -> 13796 (25.6%)
Total decoding time goes from 9.326 sec to 9.182 sec.
This commit is contained in:
parent
0d9375fc90
commit
18175baa54
@ -56,6 +56,9 @@ mc_func(avg, sz, v, ssse3)
|
||||
|
||||
/* Instantiate the mc_funcs set once per block width. The macro body is
 * outside this view; the argument is the width in pixels (4, 8, 16).
 * The 16px set is only instantiated on x86-64, matching the assembly side
 * where the 16px 8-tap functions are wrapped in %if ARCH_X86_64. */
mc_funcs(4);
|
||||
mc_funcs(8);
|
||||
#if ARCH_X86_64
|
||||
mc_funcs(16);
|
||||
#endif
|
||||
|
||||
/* The helper macros are only needed for the instantiations above. */
#undef mc_funcs
|
||||
#undef mc_func
|
||||
@ -78,7 +81,9 @@ mc_rep_func(avg, sz, hsz, h, ssse3); \
|
||||
mc_rep_func(put, sz, hsz, v, ssse3); \
|
||||
mc_rep_func(avg, sz, hsz, v, ssse3)
|
||||
|
||||
/* mc_rep_funcs(sz, hsz) instantiates the sz-wide functions in terms of the
 * hsz-wide ones (presumably by running the half-size function on each half;
 * the mc_rep_func body is not visible here -- confirm against its definition).
 * On x86-32 there are no native 16px functions (those are x86-64 only), so
 * the 16px variants are built from the 8px ones; on x86-64 this step is
 * skipped because the native 16px functions exist. */
#if ARCH_X86_32
|
||||
mc_rep_funcs(16, 8);
|
||||
#endif
|
||||
/* 32px and 64px are always derived from the next size down. */
mc_rep_funcs(32, 16);
|
||||
mc_rep_funcs(64, 32);
|
||||
|
||||
|
@ -145,6 +145,62 @@ INIT_XMM ssse3
|
||||
filter_h_fn put
|
||||
filter_h_fn avg
|
||||
|
||||
%if ARCH_X86_64
|
||||
; filter_hx2_fn <put|avg>
;
; Emits <%1>_8tap_1d_h_<mmsize>: a horizontal 8-tap 1-D filter that writes one
; full vector register (mmsize bytes, one byte per pixel) of output per row.
;
; Arguments (named by cglobal below):
;   dst, dstride  - destination pointer and its row stride
;   src, sstride  - source pointer and its row stride
;   h             - number of rows to process (loop counter)
;   filtery       - pointer to the coefficient table; four 16-byte vectors are
;                   read from offsets 0, 16, 32 and 48
;
; %1 == avg additionally averages the result with the bytes already in dst.
; Uses vector registers m0-m13, which is why the macro sits inside
; %if ARCH_X86_64 (registers above m7 presumably need 64-bit mode).
%macro filter_hx2_fn 1
|
||||
; Output width in pixels equals the vector register size in bytes.
%assign %%px mmsize
|
||||
; x86inc cglobal: 6 arguments, 6 general-purpose registers, 14 vector registers.
cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
|
||||
; m13 = multiplier for the final pmulhrsw. pw_256 is defined outside this view;
; if it is packed words of 256, pmulhrsw by it is a rounded shift: (x + 64) >> 7.
mova m13, [pw_256]
|
||||
; m8..m11 = the four coefficient vectors, one per pair of adjacent taps
; (pmaddubsw below consumes coefficients in byte pairs).
mova m8, [filteryq+ 0]
|
||||
mova m9, [filteryq+16]
|
||||
mova m10, [filteryq+32]
|
||||
mova m11, [filteryq+48]
|
||||
; One iteration per output row.
.loop:
|
||||
; Eight unaligned loads of the same row at byte offsets -3..+4, one per tap.
movu m0, [srcq-3]
|
||||
movu m1, [srcq-2]
|
||||
movu m2, [srcq-1]
|
||||
movu m3, [srcq+0]
|
||||
movu m4, [srcq+1]
|
||||
movu m5, [srcq+2]
|
||||
movu m6, [srcq+3]
|
||||
movu m7, [srcq+4]
|
||||
; Advance to the next source row now; the loaded data is already in registers.
add srcq, sstrideq
|
||||
; SBUTTERFLY is defined outside this view; it is expected to interleave the
; bytes of the two named registers (low half into the first, high half into
; the second) using m12 as scratch. After this, even registers would hold the
; tap pairs for the low half of the row and odd registers the high half.
SBUTTERFLY bw, 0, 1, 12
|
||||
SBUTTERFLY bw, 2, 3, 12
|
||||
SBUTTERFLY bw, 4, 5, 12
|
||||
SBUTTERFLY bw, 6, 7, 12
|
||||
; Multiply unsigned source bytes by signed coefficient bytes and add each
; adjacent pair into a 16-bit word: one partial sum per tap pair.
pmaddubsw m0, m8
|
||||
pmaddubsw m1, m8
|
||||
pmaddubsw m2, m9
|
||||
pmaddubsw m3, m9
|
||||
pmaddubsw m4, m10
|
||||
pmaddubsw m5, m10
|
||||
pmaddubsw m6, m11
|
||||
pmaddubsw m7, m11
|
||||
; Combine the four partial sums per half: two wrapping adds first...
paddw m0, m2
|
||||
paddw m1, m3
|
||||
paddw m4, m6
|
||||
paddw m5, m7
|
||||
; ...then a signed-saturating add for the final combination.
paddsw m0, m4
|
||||
paddsw m1, m5
|
||||
; Scale the 16-bit sums back down with rounding (see m13 above).
pmulhrsw m0, m13
|
||||
pmulhrsw m1, m13
|
||||
; Pack both halves to bytes with unsigned saturation: m0 = one output row.
packuswb m0, m1
|
||||
; avg variant only: rounded byte average with the existing destination row.
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
%endif
|
||||
; Aligned store, so dst is assumed to be aligned to the register size.
mova [dstq], m0
|
||||
add dstq, dstrideq
|
||||
; Loop while the remaining row count is still positive.
dec hd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate the put and avg variants for XMM registers with SSSE3
; (presumably mmsize = 16 here, giving the 16px functions of this commit).
INIT_XMM ssse3
|
||||
filter_hx2_fn put
|
||||
filter_hx2_fn avg
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
%macro filter_v_fn 1
|
||||
%assign %%px mmsize/2
|
||||
%if ARCH_X86_64
|
||||
@ -220,6 +276,72 @@ INIT_XMM ssse3
|
||||
filter_v_fn put
|
||||
filter_v_fn avg
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
; filter_vx2_fn <put|avg>
;
; Emits <%1>_8tap_1d_v_<mmsize>: a vertical 8-tap 1-D filter that writes one
; full vector register (mmsize bytes, one byte per pixel) of output per row.
;
; Arguments (named by cglobal below):
;   dst, dstride  - destination pointer and its row stride
;   src, sstride  - source pointer and its row stride
;   h             - number of rows to process (loop counter)
;   filtery       - pointer to the coefficient table; four 16-byte vectors are
;                   read from offsets 0, 16, 32 and 48
; Two extra general-purpose registers are used as temporaries:
;   src4          - src advanced by four rows
;   sstride3      - 3 * sstride
;
; %1 == avg additionally averages the result with the bytes already in dst.
; Uses vector registers m0-m13, which is why the macro sits inside
; %if ARCH_X86_64 (registers above m7 presumably need 64-bit mode).
%macro filter_vx2_fn 1
|
||||
; Output width in pixels equals the vector register size in bytes.
%assign %%px mmsize
|
||||
; x86inc cglobal: 6 arguments, 8 general-purpose registers, 14 vector registers.
cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
|
||||
; Setup: move src back by three rows in total (three separate subtractions,
; interleaved with the other setup instructions), so that the eight rows read
; per iteration are rows -3..+4 relative to the original src.
sub srcq, sstrideq
|
||||
; sstride3 = 3 * sstride, used to address the 4th row from a base pointer.
lea sstride3q, [sstrideq*3]
|
||||
sub srcq, sstrideq
|
||||
; m13 = multiplier for the final pmulhrsw. pw_256 is defined outside this view;
; if it is packed words of 256, pmulhrsw by it is a rounded shift: (x + 64) >> 7.
mova m13, [pw_256]
|
||||
sub srcq, sstrideq
|
||||
; m8..m11 = the four coefficient vectors, one per pair of adjacent taps.
mova m8, [filteryq+ 0]
|
||||
; src4 = (already rewound) src + 4 rows: base pointer for rows 4..7.
lea src4q, [srcq+sstrideq*4]
|
||||
mova m9, [filteryq+16]
|
||||
|
||||
mova m10, [filteryq+32]
|
||||
|
||||
mova m11, [filteryq+48]
|
||||
|
||||
; One iteration per output row.
.loop:
|
||||
; FIXME maybe reuse loads from previous rows, or just
|
||||
; more generally unroll this to prevent multiple loads of
|
||||
; the same data?
|
||||
; Load the eight consecutive source rows: rows 0..3 via src, 4..7 via src4.
movu m0, [srcq]
|
||||
movu m1, [srcq+sstrideq]
|
||||
movu m2, [srcq+sstrideq*2]
|
||||
movu m3, [srcq+sstride3q]
|
||||
movu m4, [src4q]
|
||||
movu m5, [src4q+sstrideq]
|
||||
movu m6, [src4q+sstrideq*2]
|
||||
movu m7, [src4q+sstride3q]
|
||||
; Slide both row pointers down by one row for the next iteration.
add srcq, sstrideq
|
||||
add src4q, sstrideq
|
||||
; SBUTTERFLY is defined outside this view; it is expected to interleave the
; bytes of the two named registers (low half into the first, high half into
; the second) using m12 as scratch. That would pair each pixel with the pixel
; directly below it, so pmaddubsw can apply two vertical taps at once.
SBUTTERFLY bw, 0, 1, 12
|
||||
SBUTTERFLY bw, 2, 3, 12
|
||||
SBUTTERFLY bw, 4, 5, 12
|
||||
SBUTTERFLY bw, 6, 7, 12
|
||||
; Multiply unsigned source bytes by signed coefficient bytes and add each
; adjacent pair into a 16-bit word: one partial sum per tap pair.
pmaddubsw m0, m8
|
||||
pmaddubsw m1, m8
|
||||
pmaddubsw m2, m9
|
||||
pmaddubsw m3, m9
|
||||
pmaddubsw m4, m10
|
||||
pmaddubsw m5, m10
|
||||
pmaddubsw m6, m11
|
||||
pmaddubsw m7, m11
|
||||
; Combine the four partial sums per half: two wrapping adds first...
paddw m0, m2
|
||||
paddw m1, m3
|
||||
paddw m4, m6
|
||||
paddw m5, m7
|
||||
; ...then a signed-saturating add for the final combination.
paddsw m0, m4
|
||||
paddsw m1, m5
|
||||
; Scale the 16-bit sums back down with rounding (see m13 above).
pmulhrsw m0, m13
|
||||
pmulhrsw m1, m13
|
||||
; Pack both halves to bytes with unsigned saturation: m0 = one output row.
packuswb m0, m1
|
||||
; avg variant only: rounded byte average with the existing destination row.
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
%endif
|
||||
; Aligned store, so dst is assumed to be aligned to the register size.
mova [dstq], m0
|
||||
add dstq, dstrideq
|
||||
; Loop while the remaining row count is still positive.
dec hd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate the put and avg variants for XMM registers with SSSE3
; (presumably mmsize = 16 here, giving the 16px functions of this commit).
INIT_XMM ssse3
|
||||
filter_vx2_fn put
|
||||
filter_vx2_fn avg
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
%macro fpel_fn 6
|
||||
%if %2 == 4
|
||||
%define %%srcfn movh
|
||||
|
Loading…
Reference in New Issue
Block a user