Code cleanup of tm_predictor_32x32

Reallocate the xmm registers so that ARCH_X86_64 is no longer required.
Reduce memory accesses to the left neighbor by half.
Single-digit percentage speedup on big-core machines.

Change-Id: I392515ed8e8aeb02e6a717b3966b1ba13f5be990
Jian Zhou 2015-12-10 17:25:18 -08:00
parent 62f986265f
commit 88120481a4
3 changed files with 21 additions and 33 deletions
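
For context: the TM ("TrueMotion") predictor computes each output pixel as
clip(left[r] + above[c] - above[-1]), so a whole row reuses the same
(above[c] - top_left) differences and only the broadcast left pixel changes
per row. A minimal C sketch of that reference behavior, assuming the
standard TM definition (clip_u8 is a local stand-in for libvpx's
clip_pixel):

#include <stddef.h>
#include <stdint.h>

/* Local stand-in for libvpx's clip_pixel(). */
static uint8_t clip_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar TM prediction for a 32x32 block. The SSE2 version keeps the
 * (above - top_left) words resident in m0/m3/m4/m5 for the whole block
 * and gets the clip for free from packuswb's unsigned saturation. */
static void tm_predictor_32x32(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  for (int r = 0; r < 32; ++r) {
    for (int c = 0; c < 32; ++c) {
      dst[c] = clip_u8(left[r] + above[c] - top_left);
    }
    dst += stride;
  }
}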

test/test_intra_pred_speed.cc

@@ -337,21 +337,12 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
                 vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
 
 #if HAVE_SSE2 && CONFIG_USE_X86INC
-#if ARCH_X86_64
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
                 vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_tm_predictor_32x32_sse2)
-#else
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
-                vpx_dc_left_predictor_32x32_sse2,
-                vpx_dc_top_predictor_32x32_sse2,
-                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL)
-#endif  // ARCH_X86_64
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC

vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -241,7 +241,7 @@ add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc";
+specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";

vpx_dsp/x86/intrapred_sse2.asm

@@ -699,9 +699,8 @@ cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
   pxor                  m1, m1
   movd                  m2, [aboveq-1]
   mova                  m0, [aboveq]
@@ -722,31 +721,29 @@ cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
   psubw                 m5, m2
 .loop:
   movd                  m2, [leftq+lineq*2]
-  movd                  m6, [leftq+lineq*2+1]
+  pxor                  m1, m1
   punpcklbw             m2, m1
-  punpcklbw             m6, m1
+  pshuflw               m7, m2, 0x55
   pshuflw               m2, m2, 0x0
-  pshuflw               m6, m6, 0x0
   punpcklqdq            m2, m2
-  punpcklqdq            m6, m6
-  paddw                 m7, m2, m0
-  paddw                 m8, m2, m3
-  paddw                 m9, m2, m4
-  paddw                 m2, m5
-  packuswb              m7, m8
-  packuswb              m9, m2
-  paddw                 m2, m6, m0
-  paddw                 m8, m6, m3
-  mova      [dstq        ], m7
-  paddw                 m7, m6, m4
-  paddw                 m6, m5
-  mova      [dstq     +16], m9
-  packuswb              m2, m8
-  packuswb              m7, m6
-  mova      [dstq+strideq ], m2
-  mova      [dstq+strideq+16], m7
+  punpcklqdq            m7, m7
+  paddw                 m6, m2, m3
+  paddw                 m1, m2, m0
+  packuswb              m1, m6
+  mova      [dstq        ], m1
+  paddw                 m6, m2, m5
+  paddw                 m1, m2, m4
+  packuswb              m1, m6
+  mova      [dstq+16     ], m1
+  paddw                 m6, m7, m3
+  paddw                 m1, m7, m0
+  packuswb              m1, m6
+  mova      [dstq+strideq ], m1
+  paddw                 m6, m7, m5
+  paddw                 m1, m7, m4
+  packuswb              m1, m6
+  mova      [dstq+strideq+16], m1
   lea                 dstq, [dstq+strideq*2]
   inc                lineq
   jnz .loop
   REP_RET
-%endif
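
The halved left-neighbor traffic is visible in the new .loop body: a single
movd now fetches left[2r] and left[2r+1] together, and pshuflw with
immediates 0x0 and 0x55 broadcasts each of the two words without a second
load. Reusing m1/m6/m7 as accumulators instead of m8/m9 is what lifts the
ARCH_X86_64 restriction, since xmm8-xmm15 exist only in 64-bit mode. A rough
SSE2-intrinsics sketch of the broadcast idiom (the helper name is
illustrative, not from the tree):

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

/* Broadcast two consecutive left-neighbor pixels to all eight words of two
 * xmm registers, mirroring the movd/punpcklbw/pshuflw/punpcklqdq sequence. */
static void broadcast_two_left(const uint8_t *left2, /* -> left[2r] */
                               __m128i *row_even, __m128i *row_odd) {
  uint16_t pair;                       /* left[2r] | left[2r+1] << 8 */
  memcpy(&pair, left2, sizeof(pair));  /* one load where there were two */
  __m128i v = _mm_cvtsi32_si128(pair);
  v = _mm_unpacklo_epi8(v, _mm_setzero_si128()); /* punpcklbw: bytes->words */
  __m128i e = _mm_shufflelo_epi16(v, 0x00);      /* pshuflw 0x0: word 0     */
  __m128i o = _mm_shufflelo_epi16(v, 0x55);      /* pshuflw 0x55: word 1    */
  *row_even = _mm_unpacklo_epi64(e, e);          /* punpcklqdq: fill high   */
  *row_odd  = _mm_unpacklo_epi64(o, o);
}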