vp9/x86: 16px MC functions (64bit only).

Cycle counts for large MCs (old -> new on ped1080p.webm, mx!=0&&my!=0):
16x8:    876 ->   870  (0.7%)
16x16:  1444 ->  1435  (0.7%)
16x32:  2784 ->  2748  (1.3%)
32x16:  2455 ->  2349  (4.5%)
32x32:  4641 ->  4084 (13.6%)
32x64:  9200 ->  7834 (17.4%)
64x32:  8980 ->  7197 (24.8%)
64x64: 17330 -> 13796 (25.6%)
Total decoding time goes from 9.326sec to 9.182sec.
This commit is contained in:
Ronald S. Bultje 2013-12-24 16:17:03 -05:00
parent 0d9375fc90
commit 18175baa54
2 changed files with 127 additions and 0 deletions

View File

@ -56,6 +56,9 @@ mc_func(avg, sz, v, ssse3)
mc_funcs(4);
mc_funcs(8);
#if ARCH_X86_64
mc_funcs(16);
#endif
#undef mc_funcs
#undef mc_func
@ -78,7 +81,9 @@ mc_rep_func(avg, sz, hsz, h, ssse3); \
mc_rep_func(put, sz, hsz, v, ssse3); \
mc_rep_func(avg, sz, hsz, v, ssse3)
#if ARCH_X86_32
mc_rep_funcs(16, 8);
#endif
mc_rep_funcs(32, 16);
mc_rep_funcs(64, 32);

View File

@ -145,6 +145,62 @@ INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg
%if ARCH_X86_64
; filter_hx2_fn <op>      (op is "put" or "avg", see the instantiations below)
;
; Emits <op>_8tap_1d_h_<mmsize>: a horizontal 8-tap filter that produces one
; full vector of output pixels (mmsize bytes; 16 under INIT_XMM) per row.
;
; Arguments, in cglobal order:
;   dst, dstride  - output pointer and its per-row increment
;   src, sstride  - input pointer and its per-row increment
;   h             - number of rows to process
;   filtery       - pointer to 64 bytes of coefficients, read as four vectors
;                   at offsets 0/16/32/48 (the name says "y" although this is
;                   the horizontal variant; it is simply the filter pointer)
;
; Register roles:
;   m0-m7   - the row loaded at byte offsets -3..+4 from src, then the
;             per-tap-pair partial sums
;   m8-m11  - coefficient vectors, constant for the whole call
;   m12     - scratch for SBUTTERFLY
;   m13     - rounding multiplier loaded from pw_256
;
; Guarded by ARCH_X86_64 because it declares 14 vector registers, and
; registers 8 and up are only addressable in 64-bit mode.
;
; Memory access notes:
;   - each row reads src[-3 .. mmsize+3] with unaligned loads
;   - dst is accessed with mova (and as a pavgb memory operand for avg), so it
;     must be mmsize-aligned
%macro filter_hx2_fn 1
; %%px becomes the numeric suffix of the generated function name.
%assign %%px mmsize
; 6 arguments, 6 general-purpose registers, 14 vector registers.
cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
; Loop-invariant constants: rounding multiplier and the four coefficient
; vectors. NOTE(review): each coefficient vector presumably holds one pair of
; adjacent taps replicated as signed bytes, which is what the pmaddubsw usage
; below requires - confirm against the table that callers pass in.
mova m13, [pw_256]
mova m8, [filteryq+ 0]
mova m9, [filteryq+16]
mova m10, [filteryq+32]
mova m11, [filteryq+48]
.loop:
; Eight overlapping unaligned loads: m<k> holds the row shifted so that byte i
; is src[i + k - 3], i.e. the k-th tap input for output pixel i.
movu m0, [srcq-3]
movu m1, [srcq-2]
movu m2, [srcq-1]
movu m3, [srcq+0]
movu m4, [srcq+1]
movu m5, [srcq+2]
movu m6, [srcq+3]
movu m7, [srcq+4]
; Advance to the next input row early; the loaded data is already in registers.
add srcq, sstrideq
; Byte-interleave each pair of adjacent tap inputs, using m12 as scratch.
; NOTE(review): SBUTTERFLY is defined outside this view; it is expected to
; leave the low-half interleave in the first register and the high-half
; interleave in the second - confirm in x86util.asm.
SBUTTERFLY bw, 0, 1, 12
SBUTTERFLY bw, 2, 3, 12
SBUTTERFLY bw, 4, 5, 12
SBUTTERFLY bw, 6, 7, 12
; pmaddubsw multiplies unsigned source bytes by signed coefficient bytes and
; adds each adjacent pair into a signed word: one tap pair per instruction,
; for the low (even register) and high (odd register) half of the row.
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m9
pmaddubsw m3, m9
pmaddubsw m4, m10
pmaddubsw m5, m10
pmaddubsw m6, m11
pmaddubsw m7, m11
; Combine the four tap-pair sums: (pair0 + pair1) and (pair2 + pair3) with
; wrapping adds, then a saturating add for the final total.
paddw m0, m2
paddw m1, m3
paddw m4, m6
paddw m5, m7
paddsw m0, m4
paddsw m1, m5
; Round and scale. With a multiplier of 256, pmulhrsw computes (x + 64) >> 7
; (assuming pw_256 is a vector of words equal to 256, as its name suggests).
pmulhrsw m0, m13
pmulhrsw m1, m13
; Narrow both halves back to bytes with unsigned saturation.
packuswb m0, m1
%ifidn %1, avg
; avg variant: rounding byte average with what is already stored at dst.
pavgb m0, [dstq]
%endif
mova [dstq], m0
add dstq, dstrideq
; Row counter. The body runs at least once, so h is expected to be >= 1.
dec hd
jg .loop
RET
%endmacro
; Instantiate the SSSE3 XMM versions (16 pixels per row).
INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg
%endif ; ARCH_X86_64
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
@ -220,6 +276,72 @@ INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg
%if ARCH_X86_64
; filter_vx2_fn <op>      (op is "put" or "avg", see the instantiations below)
;
; Emits <op>_8tap_1d_v_<mmsize>: a vertical 8-tap filter that produces one
; full vector of output pixels (mmsize bytes; 16 under INIT_XMM) per row.
;
; Arguments, in cglobal order:
;   dst, dstride  - output pointer and its per-row increment
;   src, sstride  - input pointer and its per-row increment
;   h             - number of rows to process
;   filtery       - pointer to 64 bytes of coefficients, read as four vectors
;                   at offsets 0/16/32/48
; Extra general-purpose registers:
;   src4          - src + 4*sstride, so all eight rows are addressable
;   sstride3      - 3*sstride
;
; Register roles:
;   m0-m7   - the eight input rows (3 above to 4 below the output row), then
;             the per-tap-pair partial sums
;   m8-m11  - coefficient vectors, constant for the whole call
;   m12     - scratch for SBUTTERFLY
;   m13     - rounding multiplier loaded from pw_256
;
; Guarded by ARCH_X86_64 because it declares 14 vector registers, and
; registers 8 and up are only addressable in 64-bit mode.
;
; Memory access notes:
;   - every output row reads eight full input rows with unaligned loads
;   - dst is accessed with mova (and as a pavgb memory operand for avg), so it
;     must be mmsize-aligned
%macro filter_vx2_fn 1
; %%px becomes the numeric suffix of the generated function name.
%assign %%px mmsize
; 6 arguments, 8 general-purpose registers, 14 vector registers.
cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
; Setup: rewind src by three rows (three separate subtractions, interleaved
; with the constant loads), derive sstride3 and src4, and load the rounding
; multiplier plus the four coefficient vectors.
sub srcq, sstrideq
lea sstride3q, [sstrideq*3]
sub srcq, sstrideq
mova m13, [pw_256]
sub srcq, sstrideq
mova m8, [filteryq+ 0]
; srcq now points three rows above the first output row; src4q is four rows
; below that, i.e. one row below the original src.
lea src4q, [srcq+sstrideq*4]
mova m9, [filteryq+16]
mova m10, [filteryq+32]
mova m11, [filteryq+48]
.loop:
; FIXME maybe reuse loads from previous rows, or just
; more generally unroll this to prevent multiple loads of
; the same data?
; Load the eight input rows for this output row: m0-m3 via srcq (rows 0..3 of
; the window), m4-m7 via src4q (rows 4..7 of the window).
movu m0, [srcq]
movu m1, [srcq+sstrideq]
movu m2, [srcq+sstrideq*2]
movu m3, [srcq+sstride3q]
movu m4, [src4q]
movu m5, [src4q+sstrideq]
movu m6, [src4q+sstrideq*2]
movu m7, [src4q+sstride3q]
; Slide the eight-row window down by one row.
add srcq, sstrideq
add src4q, sstrideq
; Byte-interleave each pair of adjacent rows, using m12 as scratch.
; NOTE(review): SBUTTERFLY is defined outside this view; it is expected to
; leave the low-half interleave in the first register and the high-half
; interleave in the second - confirm in x86util.asm.
SBUTTERFLY bw, 0, 1, 12
SBUTTERFLY bw, 2, 3, 12
SBUTTERFLY bw, 4, 5, 12
SBUTTERFLY bw, 6, 7, 12
; pmaddubsw multiplies unsigned source bytes by signed coefficient bytes and
; adds each adjacent pair into a signed word: one tap pair per instruction,
; for the low (even register) and high (odd register) half of the row.
; NOTE(review): this requires each coefficient vector to hold one tap pair
; replicated as signed bytes - confirm against the table callers pass in.
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m9
pmaddubsw m3, m9
pmaddubsw m4, m10
pmaddubsw m5, m10
pmaddubsw m6, m11
pmaddubsw m7, m11
; Combine the four tap-pair sums: (pair0 + pair1) and (pair2 + pair3) with
; wrapping adds, then a saturating add for the final total.
paddw m0, m2
paddw m1, m3
paddw m4, m6
paddw m5, m7
paddsw m0, m4
paddsw m1, m5
; Round and scale. With a multiplier of 256, pmulhrsw computes (x + 64) >> 7
; (assuming pw_256 is a vector of words equal to 256, as its name suggests).
pmulhrsw m0, m13
pmulhrsw m1, m13
; Narrow both halves back to bytes with unsigned saturation.
packuswb m0, m1
%ifidn %1, avg
; avg variant: rounding byte average with what is already stored at dst.
pavgb m0, [dstq]
%endif
mova [dstq], m0
add dstq, dstrideq
; Row counter. The body runs at least once, so h is expected to be >= 1.
dec hd
jg .loop
RET
%endmacro
; Instantiate the SSSE3 XMM versions (16 pixels per row).
INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg
%endif ; ARCH_X86_64
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh