diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 2bebdcbd9..1746be28b 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -162,6 +162,10 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, #else INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, ::testing::Values( + make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + make_tuple(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, @@ -218,6 +222,12 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, #else INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, ::testing::Values( + make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, + 10), + make_tuple(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, + 10), make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, @@ -275,6 +285,12 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, #else INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, ::testing::Values( + make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, + 12), + make_tuple(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, + 12), make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index a2a067457..84edc9eda 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -387,7 +387,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc"; + specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc"; @@ -438,7 +438,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc"; add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc"; + specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_32x32/; diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm index 233958a52..f46ffec23 100644 --- a/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -118,30 +118,29 @@ cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset RESTORE_GOT REP_RET -%if ARCH_X86_64 INIT_XMM sse2 -cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset GET_GOT goffsetq - pxor m1, m1 mova m0, [aboveq] mova m2, [aboveq+16] mova m3, [aboveq+32] mova m4, [aboveq+48] - mova m5, [leftq] - mova m6, [leftq+16] - mova m7, [leftq+32] - mova m8, [leftq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, 
[strideq*3] mov lines4d, 8 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - paddw m0, m5 - paddw m0, m6 - paddw m0, m7 - paddw m0, m8 movhlps m2, m0 paddw m0, m2 punpcklwd m0, m1 @@ -177,7 +176,6 @@ cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset RESTORE_GOT REP_RET -%endif INIT_XMM sse2 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above @@ -340,61 +338,54 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one jnz .loop REP_RET -%if ARCH_X86_64 INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps movd m2, [aboveq-2] mova m0, [aboveq] mova m1, [aboveq+16] pshuflw m2, m2, 0x0 ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m7, m7 - pxor m8, m8 - pinsrw m7, oned, 0 - pinsrw m8, bpsd, 0 - pshuflw m7, m7, 0x0 - DEFINE_ARGS dst, stride, line, left - punpcklqdq m7, m7 - mov lineq, -8 - mova m5, m7 + pcmpeqw m3, m3 + movd m4, bpsd punpcklqdq m2, m2 - psllw m7, m8 - add leftq, 32 - psubw m7, m5 ; max possible value - pxor m8, m8 ; min possible value + psllw m3, m4 + pcmpeqw m5, m5 + pxor m4, m4 ; min possible value + pxor m3, m5 ; max possible value + DEFINE_ARGS dst, stride, line, left + mov lineq, -8 psubw m0, m2 psubw m1, m2 .loop: - movd m2, [leftq+lineq*4] - movd m3, [leftq+lineq*4+2] - pshuflw m2, m2, 0x0 - pshuflw m3, m3, 0x0 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - paddw m4, m2, m0 - paddw m5, m3, m0 + movd m7, [leftq] + pshuflw m5, m7, 0x0 + pshuflw m2, m7, 0x55 + punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 + punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m6, m5, m0 ; t1-tl+l1 to t8-tl+l1 + paddw m5, m1 ; t9-tl+l1 to t16-tl+l1 + pminsw m6, m3 + pminsw m5, m3 + pmaxsw m6, m4 ; Clamp to the bit-depth + pmaxsw m5, m4 + mova [dstq ], m6 + mova [dstq +16], m5 + paddw m6, m2, m0 paddw m2, m1 - paddw m3, m1 - ;Clamp to the bit-depth - pminsw m4, m7 - 
pminsw m5, m7 - pminsw m2, m7 - pminsw m3, m7 - pmaxsw m4, m8 - pmaxsw m5, m8 - pmaxsw m2, m8 - pmaxsw m3, m8 - ;Store the values - mova [dstq ], m4 - mova [dstq+strideq*2 ], m5 - mova [dstq +16], m2 - mova [dstq+strideq*2+16], m3 + pminsw m6, m3 + pminsw m2, m3 + pmaxsw m6, m4 + pmaxsw m2, m4 + mova [dstq+strideq*2 ], m6 + mova [dstq+strideq*2+16], m2 lea dstq, [dstq+strideq*4] inc lineq + lea leftq, [leftq+4] + jnz .loop REP_RET +%if ARCH_X86_64 INIT_XMM sse2 cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one movd m0, [aboveq-2]