Merge "Code clean of sub_pixel_variance4xh"

This commit is contained in:
James Zern 2016-05-18 23:18:35 +00:00 committed by Gerrit Code Review
commit 146ccd304f
4 changed files with 158 additions and 60 deletions

View File

@ -1026,8 +1026,8 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, VpxSubpelAvgVarianceTest,
@ -1043,8 +1043,8 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
#endif // CONFIG_USE_X86INC
#if CONFIG_VP9_HIGHBITDEPTH

View File

@ -1493,10 +1493,10 @@ add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int s
specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
# 4x8 sub-pixel variance: prototype string followed by its specialization list.
# NOTE(review): two specialize lines follow for the same function, differing
# only in "$sse_x86inc" vs "$sse2_x86inc" -- this looks like the before/after
# pair of a diff; confirm which one is meant to remain.
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
# 4x4 sub-pixel variance: same prototype shape; its list additionally names mmx.
# NOTE(review): same "$sse_x86inc" / "$sse2_x86inc" duplication as for 4x8.
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
# Averaging variants start here: the prototype gains a trailing
# "const uint8_t *second_pred" parameter.
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@ -1532,10 +1532,10 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, i
specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
# 4x8 averaging sub-pixel variance (prototype ends with "second_pred").
# NOTE(review): two specialize lines follow for the same function, differing
# only in "$sse_x86inc" vs "$sse2_x86inc" -- this looks like the before/after
# pair of a diff; confirm which one is meant to remain.
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
# 4x4 averaging sub-pixel variance.
# NOTE(review): same "$sse_x86inc" / "$sse2_x86inc" duplication as for 4x8.
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
#
# Specialty Subpixel

View File

@ -57,8 +57,8 @@ SECTION .text
paddd %6, %1
%endmacro
%macro STORE_AND_RET 0
%if mmsize == 16
%macro STORE_AND_RET 1
%if %1 > 4
; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
; We have to sign-extend it before adding the words within the register
@ -78,16 +78,16 @@ SECTION .text
movd [r1], m7 ; store sse
paddd m6, m4
movd raxd, m6 ; store sum as return value
%else ; mmsize == 8
pshufw m4, m6, 0xe
pshufw m3, m7, 0xe
%else ; 4xh
pshuflw m4, m6, 0xe
pshuflw m3, m7, 0xe
paddw m6, m4
paddd m7, m3
pcmpgtw m5, m6 ; mask for 0 > x
mov r1, ssem ; r1 = unsigned int *sse
punpcklwd m6, m5 ; sign-extend m6 word->dword
movd [r1], m7 ; store sse
pshufw m4, m6, 0xe
pshuflw m4, m6, 0xe
paddd m6, m4
movd raxd, m6 ; store sum as return value
%endif
@ -226,8 +226,14 @@ SECTION .text
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpcklbw m0, m5
movhlps m2, m0
%endif
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
@ -239,22 +245,40 @@ SECTION .text
%else ; %1 < 16
movh m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
%if %1 > 4
movhps m0, [srcq+src_strideq]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
%else ; 4xh
movd m1, [srcq+src_strideq]
punpckldq m0, m1
%endif
%else ; !avg
movh m2, [srcq+src_strideq]
%endif
%if %1 > 4
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
%else ; 4xh
movd m1, [dstq]
movd m3, [dstq+dst_strideq]
%endif
%if %2 == 1 ; avg
%if %1 > 4
pavgb m0, [secq]
%else
movh m2, [secq]
pavgb m0, m2
%endif
punpcklbw m3, m5
punpcklbw m1, m5
%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
@ -271,7 +295,7 @@ SECTION .text
%endif
dec block_height
jg .x_zero_y_zero_loop
STORE_AND_RET
STORE_AND_RET %1
.x_zero_y_nonzero:
cmp y_offsetd, 4
@ -299,9 +323,9 @@ SECTION .text
movh m0, [srcq]
movh m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
%if %1 > 4
movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%else ; 4xh
%if %1 == 4
movh m1, [srcq+src_strideq*2]
punpckldq m2, m1
@ -310,18 +334,26 @@ SECTION .text
%endif
%endif
movh m1, [dstq]
%if mmsize == 16
%if %1 > 4
movlhps m0, m2
%else ; mmsize == 8
%else ; 4xh
punpckldq m0, m2
%endif
movh m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
%if %1 > 4
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
movh m4, [secq]
pavgb m0, m4
punpcklbw m3, m5
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
@ -343,7 +375,7 @@ SECTION .text
%endif
dec block_height
jg .x_zero_y_half_loop
STORE_AND_RET
STORE_AND_RET %1
.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
@ -351,7 +383,7 @@ SECTION .text
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
@ -456,10 +488,20 @@ SECTION .text
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@ -475,7 +517,7 @@ SECTION .text
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
STORE_AND_RET %1
.x_nonzero:
cmp x_offsetd, 4
@ -506,21 +548,31 @@ SECTION .text
movh m0, [srcq]
movh m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
%if %1 > 4
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
punpckldq m4, [srcq+src_strideq+1]
%else ; 4xh
movd m1, [srcq+src_strideq]
punpckldq m0, m1
movd m2, [srcq+src_strideq+1]
punpckldq m4, m2
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
%if %1 > 4
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; 4xh
movh m2, [secq]
pavgb m0, m2
punpcklbw m1, m5
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
movh m2, [srcq+src_strideq]
movh m1, [dstq]
@ -543,7 +595,7 @@ SECTION .text
%endif
dec block_height
jg .x_half_y_zero_loop
STORE_AND_RET
STORE_AND_RET %1
.x_half_y_nonzero:
cmp y_offsetd, 4
@ -586,7 +638,7 @@ SECTION .text
movh m2, [srcq]
movh m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
%if %1 > 4
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
@ -601,21 +653,31 @@ SECTION .text
%endif
%endif
pavgb m2, m3
%if mmsize == 16
%if %1 > 4
movlhps m0, m2
movhlps m4, m2
%else ; mmsize == 8
%else ; 4xh
punpckldq m0, m2
pshufw m4, m2, 0xe
pshuflw m4, m2, 0xe
%endif
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
%if %1 > 4
pavgb m0, [secq]
%else
movh m2, [secq]
pavgb m0, m2
%endif
punpcklbw m3, m5
punpcklbw m1, m5
%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpcklbw m0, m5
movhlps m2, m0
%endif
%else ; !avg
movh m4, [srcq+src_strideq]
movh m1, [srcq+src_strideq+1]
@ -641,7 +703,7 @@ SECTION .text
%endif
dec block_height
jg .x_half_y_half_loop
STORE_AND_RET
STORE_AND_RET %1
.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
@ -649,7 +711,7 @@ SECTION .text
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
@ -766,10 +828,20 @@ SECTION .text
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@ -786,7 +858,7 @@ SECTION .text
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
STORE_AND_RET %1
.x_nonhalf:
test y_offsetd, y_offsetd
@ -797,7 +869,7 @@ SECTION .text
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@ -899,10 +971,20 @@ SECTION .text
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@ -918,7 +1000,7 @@ SECTION .text
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET
STORE_AND_RET %1
.x_nonhalf_y_nonzero:
cmp y_offsetd, 4
@ -929,7 +1011,7 @@ SECTION .text
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@ -1089,10 +1171,20 @@ SECTION .text
pavgw m2, m4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
punpcklbw m3, m5
punpcklbw m1, m5
@ -1110,7 +1202,7 @@ SECTION .text
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET
STORE_AND_RET %1
.x_nonhalf_y_nonhalf:
%ifdef PIC
@ -1118,7 +1210,7 @@ SECTION .text
%endif
shl x_offsetd, filter_idx_shift
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
%if ARCH_X86_64 && %1 > 4
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@ -1345,10 +1437,20 @@ SECTION .text
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
%if %1 == 4
movlhps m0, m2
%endif
packuswb m0, m2
%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
movh m2, [secq]
pavgb m0, m2
punpcklbw m0, m5
movhlps m2, m0
%endif
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
@ -1366,7 +1468,7 @@ SECTION .text
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
STORE_AND_RET %1
%endmacro
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
@ -1375,26 +1477,22 @@ SECTION .text
; location in the sse/2 version, rather than duplicating that code in the
; binary.
; Instantiations of SUBPEL_VARIANCE.  The first argument is the value the
; macro body tests as %1 (`%1 > 4` vs. the branches commented "4xh").
; INIT_MMX / INIT_XMM presumably select 8-byte vs. 16-byte registers for the
; instantiations that follow (the macro body tests `mmsize`) -- TODO confirm.
;
; NOTE(review): the 4xh branches visible in the macro use pshuflw, movhlps and
; movlhps, which take XMM operands.  Width 4 is instantiated below under both
; INIT_MMX and INIT_XMM for each CPU flag; the INIT_MMX ones look like
; pre-change lines of a diff -- confirm which instantiations are current.
;
; Non-averaging kernels (no second macro argument).
INIT_MMX sse
SUBPEL_VARIANCE 4
INIT_XMM sse2
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
INIT_MMX ssse3
SUBPEL_VARIANCE 4
INIT_XMM ssse3
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
; Averaging kernels: second argument 1 enables the `%if %2 == 1 ; avg` paths.
INIT_MMX sse
SUBPEL_VARIANCE 4, 1
INIT_XMM sse2
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
INIT_MMX ssse3
SUBPEL_VARIANCE 4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1

View File

@ -320,11 +320,11 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
int height, unsigned int *sse, \
void *unused0, void *unused)
/* DECLS(opt1, opt2): expands DECL (defined just above this span) for the
 * widths 4, 8 and 16.  Width 4 appears twice, once with opt2 and once with
 * opt1; widths 8 and 16 always use opt1.
 * NOTE(review): the DECL(4, opt2)/DECL(4, opt1) pair and the
 * DECLS(sse2, sse)/DECLS(sse2, sse2) pair look like before/after lines of a
 * diff -- confirm which of each pair is current. */
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(4, opt1); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
/* Both helper macros are local to this section and are dropped again here. */
#undef DECLS
#undef DECL
@ -380,10 +380,10 @@ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
FN(4, 8, 4, 2, 3, opt2, (int32_t), (int32_t)); \
FN(4, 4, 4, 2, 2, opt2, (int32_t), (int32_t))
FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
FNS(sse2, sse);
FNS(sse2, sse2);
FNS(ssse3, ssse3);
#undef FNS
@ -401,11 +401,11 @@ int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
int height, unsigned int *sse, \
void *unused0, void *unused)
/* DECLS(opt1, opt2) for the averaging kernels: expands DECL (defined just
 * above this span) for the widths 4, 8 and 16.  Width 4 appears twice, once
 * with opt2 and once with opt1; widths 8 and 16 always use opt1.
 * NOTE(review): the DECL(4, opt2)/DECL(4, opt1) pair and the
 * DECLS(sse2, sse)/DECLS(sse2, sse2) pair look like before/after lines of a
 * diff -- confirm which of each pair is current. */
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(4, opt1); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
/* Both helper macros are local to this section and are dropped again here. */
#undef DECL
#undef DECLS
@ -466,8 +466,8 @@ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
FN(4, 8, 4, 2, 3, opt2, (uint32_t), (int32_t)); \
FN(4, 4, 4, 2, 2, opt2, (uint32_t), (int32_t))
FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);