Slow pshufb removal in 3 intra prediction functions.
Replaced vpx_d45_predictor_4x4_ssse3(), vpx_d45_predictor_8x8_ssse3() and vpx_d207_predictor_4x4_ssse3() with the newly created vpx_d45_predictor_4x4_sse2(), vpx_d45_predictor_8x8_sse2() and vpx_d207_predictor_4x4_sse2() respectively. The SSE2 versions are mostly neutral or slightly worse than the SSSE3 ones in good cases and better than the SSSE3 ones in the bad cases (but still worse than using the MMX registers). Change-Id: Ib0237ceb71d2c57b8a93fd3170330cfed9d56bdd
This commit is contained in:
parent
6f397b8a5b
commit
ad0646cb84
@ -191,14 +191,15 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
|
||||
INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
|
||||
vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
|
||||
vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
|
||||
vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
|
||||
vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
|
||||
NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
|
||||
vpx_tm_predictor_4x4_sse2)
|
||||
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
|
||||
|
||||
#if HAVE_SSSE3 && CONFIG_USE_X86INC
|
||||
INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
|
||||
NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
|
||||
vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
|
||||
NULL, NULL, NULL, NULL,
|
||||
vpx_d153_predictor_4x4_ssse3, NULL,
|
||||
vpx_d63_predictor_4x4_ssse3, NULL)
|
||||
#endif // HAVE_SSSE3 && CONFIG_USE_X86INC
|
||||
|
||||
@ -240,13 +241,13 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
|
||||
INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
|
||||
vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
|
||||
vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
|
||||
vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
|
||||
NULL, vpx_tm_predictor_8x8_sse2)
|
||||
vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
|
||||
NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
|
||||
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
|
||||
|
||||
#if HAVE_SSSE3 && CONFIG_USE_X86INC
|
||||
INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
|
||||
NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
|
||||
NULL, NULL, NULL, NULL,
|
||||
vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
|
||||
vpx_d63_predictor_8x8_ssse3, NULL)
|
||||
#endif // HAVE_SSSE3 && CONFIG_USE_X86INC
|
||||
|
@ -55,13 +55,13 @@ if ($opts{arch} eq "x86_64") {
|
||||
#
|
||||
|
||||
add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
|
||||
specialize qw/vpx_d207_predictor_4x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vpx_d207e_predictor_4x4/;
|
||||
|
||||
add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
|
||||
specialize qw/vpx_d45_predictor_4x4 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vpx_d45e_predictor_4x4/;
|
||||
@ -118,7 +118,7 @@ add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, c
|
||||
specialize qw/vpx_d207e_predictor_8x8/;
|
||||
|
||||
add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
|
||||
specialize qw/vpx_d45_predictor_8x8 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vpx_d45e_predictor_8x8/;
|
||||
|
@ -11,6 +11,7 @@
|
||||
%include "third_party/x86inc/x86inc.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
pb_1: times 16 db 1
|
||||
pw_4: times 8 dw 4
|
||||
pw_8: times 8 dw 8
|
||||
pw_16: times 8 dw 16
|
||||
@ -23,6 +24,115 @@ pw2_32: times 8 dw 16
|
||||
|
||||
SECTION .text
|
||||
|
||||
; ------------------------------------------
|
||||
; args: %1 = x, %2 = y, %3 = z, %4 = result (all vector registers)
|
||||
;
|
||||
; Per-byte 3-tap smoothing filter, trick from pascal:
|
||||
; (x+2y+z+2)>>2 can be calculated as:
|
||||
; result = avg(x,z)
|
||||
; result -= xor(x,z) & 1
|
||||
; result = avg(result,y)
|
||||
;
; Side effects: %3 (z) is overwritten with (x ^ z) & 1; %1 and %2 are only
; read. pb_1 is addressed through GLOBAL(), so the functions in this file
; call GET_GOT before expanding this macro.
; ------------------------------------------
|
||||
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
|
||||
pavgb %4, %1, %3 ; result = avg(x, z); pavgb rounds up
|
||||
pxor %3, %1 ; z ^= x (low bit set when x + z is odd)
|
||||
pand %3, [GLOBAL(pb_1)] ; keep only that low bit in every byte
|
||||
psubb %4, %3 ; undo the round-up: result = (x + z) >> 1
|
||||
pavgb %4, %2 ; result = (result + y + 1) >> 1
|
||||
%endmacro
|
||||
|
||||
; ------------------------------------------
; d45_predictor_4x4 (sse2)
; Writes a 4x4 block to dst using only the row of pixels in `above`.
; Three arguments are loaded (dst, stride, above); goffset is a scratch
; register name handed to GET_GOT.
; Each output row is the filtered vector shifted one byte further, and the
; bottom-right pixel is replaced with above[7].
; ------------------------------------------
INIT_XMM sse2
|
||||
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
|
||||
GET_GOT goffsetq
|
||||
|
||||
movq m0, [aboveq] ; m0 = above[0..7], upper 8 bytes zero
|
||||
; above is not read again; its name slot is reused as `temp`
DEFINE_ARGS dst, stride, temp
|
||||
psrldq m1, m0, 1 ; m1 byte i = above[i+1] (zero shifted in at the top)
|
||||
psrldq m2, m0, 2 ; m2 byte i = above[i+2] (zeros shifted in at the top)
|
||||
; m3 byte i = (m0[i] + 2*m1[i] + m2[i] + 2) >> 2; m2 is overwritten, m0 is kept
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
|
||||
|
||||
; store 4 lines: row r = bytes r..r+3 of the filtered vector
|
||||
movd [dstq ], m3
|
||||
psrlq m3, 8 ; advance one byte for the next row
|
||||
movd [dstq+strideq ], m3
|
||||
lea dstq, [dstq+strideq*2]
|
||||
psrlq m3, 8
|
||||
movd [dstq ], m3
|
||||
psrlq m3, 8
|
||||
movd [dstq+strideq ], m3
|
||||
; The last pixel of row 3 came from byte 6 of the filtered vector, which was
; computed with a zero in place of above[8]; overwrite it with above[7].
psrlq m0, 56 ; low byte of m0 = above[7]
|
||||
movd tempq, m0
|
||||
mov [dstq+strideq+3], tempb
|
||||
|
||||
RESTORE_GOT
|
||||
RET
|
||||
|
||||
; ------------------------------------------
; d45_predictor_8x8 (sse2)
; Writes an 8x8 block to dst using only the row of pixels in `above`.
; Three arguments are loaded (dst, stride, above); goffset is a scratch
; register name handed to GET_GOT.
; A 16-byte vector is built as [8 filtered bytes | above[7] x 8]; each row
; stores 8 bytes of it, sliding one byte further per row.
; ------------------------------------------
INIT_XMM sse2
|
||||
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
|
||||
GET_GOT goffsetq
|
||||
|
||||
; NOTE(review): this loads 16 bytes starting at above[0]; assumes the caller's
; buffer has 16 readable bytes there -- TODO confirm.
movu m1, [aboveq]
|
||||
pslldq m0, m1, 1 ; m0 byte i = above[i-1], byte 0 = 0
|
||||
psrldq m2, m1, 1 ; m2 byte i = above[i+1]
|
||||
; above is not read again; its name slot is reused as `stride3`
DEFINE_ARGS dst, stride, stride3
|
||||
lea stride3q, [strideq*3]
|
||||
; m3 byte i = (m0[i] + 2*m1[i] + m2[i] + 2) >> 2; m2 is overwritten, m0 is kept
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
|
||||
; m0 byte 8 holds above[7]; the three unpacks below replicate it to 8 bytes
punpckhbw m0, m0 ; 7 7
|
||||
punpcklwd m0, m0 ; 7 7 7 7
|
||||
punpckldq m0, m0 ; 7 7 7 7 7 7 7 7
|
||||
; low half of m3 = filtered bytes, high half = above[7] replicated
punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
|
||||
|
||||
; store 4 lines
; the first shift drops byte 0, which was filtered with a zero in place of above[-1]
|
||||
psrldq m3, 1
|
||||
movq [dstq ], m3
|
||||
psrldq m3, 1
|
||||
movq [dstq+strideq ], m3
|
||||
psrldq m3, 1
|
||||
movq [dstq+strideq*2], m3
|
||||
psrldq m3, 1
|
||||
movq [dstq+stride3q ], m3
|
||||
lea dstq, [dstq+strideq*4]
|
||||
|
||||
; store next 4 lines
|
||||
psrldq m3, 1
|
||||
movq [dstq ], m3
|
||||
psrldq m3, 1
|
||||
movq [dstq+strideq ], m3
|
||||
psrldq m3, 1
|
||||
movq [dstq+strideq*2], m3
|
||||
psrldq m3, 1
|
||||
movq [dstq+stride3q ], m3
|
||||
|
||||
RESTORE_GOT
|
||||
RET
|
||||
|
||||
; ------------------------------------------
; d207_predictor_4x4 (sse2)
; Writes a 4x4 block to dst using only the 4 pixels in `left` (a, b, c, d in
; the comments below); the third argument is named `unused` and never read.
; Rows 0-2 are 4-byte windows over interleaved 2-tap / 3-tap averages, each
; window starting 2 bytes later; row 3 is the last left pixel replicated.
;
; NOTE(review): five register names are listed (goffset is the fifth) while
; the numeric fields are 4, 4, 5, whereas dc_predictor_4x4 in this file uses
; 4, 5, 3 for the same name list. Presumably the second number is the GPR
; count -- confirm that goffset is covered on 32-bit PIC builds.
; ------------------------------------------
INIT_XMM sse2
|
||||
cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
|
||||
GET_GOT goffsetq
|
||||
|
||||
movd m0, [leftq] ; abcd [byte]
|
||||
punpcklbw m4, m0, m0 ; aabb ccdd
|
||||
punpcklwd m4, m4 ; aaaa bbbb cccc dddd
|
||||
psrldq m4, 12 ; dddd
|
||||
punpckldq m0, m4 ; abcd dddd
|
||||
psrldq m1, m0, 1 ; bcdd
|
||||
psrldq m2, m0, 2 ; cddd
|
||||
|
||||
; m3 = 3-tap filtered bytes; m2 is overwritten by the macro
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d
|
||||
pavgb m1, m0 ; ab, bc, cd, d [byte]
|
||||
|
||||
; interleave the 2-tap averages (m1) with the 3-tap results (m3)
punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
|
||||
movd [dstq ], m1
|
||||
psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
|
||||
movd [dstq+strideq], m1
|
||||
|
||||
lea dstq, [dstq+strideq*2]
|
||||
psrlq m1, 16 ; cd, c3d, d, d
|
||||
movd [dstq ], m1
|
||||
; last row reuses the replicated left[3] built in m4 above
movd [dstq+strideq], m4 ; d, d, d, d
|
||||
RESTORE_GOT
|
||||
RET
|
||||
|
||||
INIT_XMM sse2
|
||||
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
|
||||
GET_GOT goffsetq
|
||||
|
@ -13,7 +13,6 @@
|
||||
SECTION_RODATA
|
||||
|
||||
pb_1: times 16 db 1
|
||||
sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
|
||||
@ -28,77 +27,9 @@ sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
SECTION .text
|
||||
|
||||
INIT_MMX ssse3
|
||||
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
|
||||
GET_GOT goffsetq
|
||||
|
||||
movq m0, [aboveq]
|
||||
pshufb m2, m0, [GLOBAL(sh_b23456777)]
|
||||
pshufb m1, m0, [GLOBAL(sh_b01234577)]
|
||||
pshufb m0, [GLOBAL(sh_b12345677)]
|
||||
pavgb m3, m2, m1
|
||||
pxor m2, m1
|
||||
pand m2, [GLOBAL(pb_1)]
|
||||
psubb m3, m2
|
||||
pavgb m0, m3
|
||||
|
||||
; store 4 lines
|
||||
movd [dstq ], m0
|
||||
psrlq m0, 8
|
||||
movd [dstq+strideq], m0
|
||||
lea dstq, [dstq+strideq*2]
|
||||
psrlq m0, 8
|
||||
movd [dstq ], m0
|
||||
psrlq m0, 8
|
||||
movd [dstq+strideq], m0
|
||||
|
||||
RESTORE_GOT
|
||||
RET
|
||||
|
||||
INIT_MMX ssse3
|
||||
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
|
||||
GET_GOT goffsetq
|
||||
|
||||
movq m0, [aboveq]
|
||||
mova m1, [GLOBAL(sh_b12345677)]
|
||||
DEFINE_ARGS dst, stride, stride3
|
||||
lea stride3q, [strideq*3]
|
||||
pshufb m2, m0, [GLOBAL(sh_b23456777)]
|
||||
pavgb m3, m2, m0
|
||||
pxor m2, m0
|
||||
pshufb m0, m1
|
||||
pand m2, [GLOBAL(pb_1)]
|
||||
psubb m3, m2
|
||||
pavgb m0, m3
|
||||
|
||||
; store 4 lines
|
||||
movq [dstq ], m0
|
||||
pshufb m0, m1
|
||||
movq [dstq+strideq ], m0
|
||||
pshufb m0, m1
|
||||
movq [dstq+strideq*2], m0
|
||||
pshufb m0, m1
|
||||
movq [dstq+stride3q ], m0
|
||||
pshufb m0, m1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
|
||||
; store next 4 lines
|
||||
movq [dstq ], m0
|
||||
pshufb m0, m1
|
||||
movq [dstq+strideq ], m0
|
||||
pshufb m0, m1
|
||||
movq [dstq+strideq*2], m0
|
||||
pshufb m0, m1
|
||||
movq [dstq+stride3q ], m0
|
||||
|
||||
RESTORE_GOT
|
||||
RET
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
|
||||
GET_GOT goffsetq
|
||||
@ -715,28 +646,6 @@ cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
|
||||
RESTORE_GOT
|
||||
RET
|
||||
|
||||
INIT_MMX ssse3
|
||||
cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
|
||||
GET_GOT goffsetq
|
||||
movd m0, [leftq] ; abcd [byte]
|
||||
pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
|
||||
pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd
|
||||
|
||||
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
|
||||
pavgb m1, m0 ; ab, bc, cd, d [byte]
|
||||
|
||||
punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
|
||||
movd [dstq ], m1
|
||||
psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
|
||||
movd [dstq+strideq], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
psrlq m1, 16 ; cd, c3d, d, d
|
||||
movd [dstq ], m1
|
||||
pshufw m1, m1, q1111 ; d, d, d, d
|
||||
movd [dstq+strideq], m1
|
||||
RESTORE_GOT
|
||||
RET
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
|
||||
GET_GOT goffsetq
|
||||
|
Loading…
x
Reference in New Issue
Block a user