remove mmx variance functions
There are sse2 equivalents, which is a reasonable modern baseline.

Removed mmx variance functions:
  vpx_get_mb_ss_mmx()
  vpx_get8x8var_mmx()
  vpx_get4x4var_mmx()
  vpx_variance4x4_mmx()
  vpx_variance8x8_mmx()
  vpx_mse16x16_mmx()
  vpx_variance16x16_mmx()
  vpx_variance16x8_mmx()
  vpx_variance8x16_mmx()

Change-Id: Iffaf85344c6676a3dd337c0645a2dd5deb2f86a1
This commit is contained in:
@@ -977,20 +977,6 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
|
||||||
#if HAVE_MMX
|
#if HAVE_MMX
|
||||||
INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
|
|
||||||
::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));
|
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
|
|
||||||
::testing::Values(vpx_get_mb_ss_mmx));
|
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(
|
|
||||||
MMX, VpxVarianceTest,
|
|
||||||
::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),
|
|
||||||
make_tuple(4, 3, &vpx_variance16x8_mmx, 0),
|
|
||||||
make_tuple(3, 4, &vpx_variance8x16_mmx, 0),
|
|
||||||
make_tuple(3, 3, &vpx_variance8x8_mmx, 0),
|
|
||||||
make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));
|
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(
|
INSTANTIATE_TEST_CASE_P(
|
||||||
MMX, VpxSubpelVarianceTest,
|
MMX, VpxSubpelVarianceTest,
|
||||||
::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
|
::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
|
||||||
|
@@ -1407,16 +1407,16 @@ add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int sourc
|
|||||||
specialize qw/vpx_variance16x32 sse2 msa/;
|
specialize qw/vpx_variance16x32 sse2 msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
|
specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
|
specialize qw/vpx_variance16x8 sse2 neon msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
|
specialize qw/vpx_variance8x16 sse2 neon msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
|
specialize qw/vpx_variance8x8 sse2 media neon msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_variance8x4 sse2 msa/;
|
specialize qw/vpx_variance8x4 sse2 msa/;
|
||||||
@@ -1425,7 +1425,7 @@ add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_
|
|||||||
specialize qw/vpx_variance4x8 sse2 msa/;
|
specialize qw/vpx_variance4x8 sse2 msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_variance4x4 mmx sse2 msa/;
|
specialize qw/vpx_variance4x4 sse2 msa/;
|
||||||
|
|
||||||
#
|
#
|
||||||
# Specialty Variance
|
# Specialty Variance
|
||||||
@@ -1434,10 +1434,10 @@ add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride,
|
|||||||
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
|
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
|
||||||
|
|
||||||
add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
specialize qw/vpx_get8x8var mmx sse2 neon msa/;
|
specialize qw/vpx_get8x8var sse2 neon msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
|
specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
specialize qw/vpx_mse16x8 sse2 msa/;
|
specialize qw/vpx_mse16x8 sse2 msa/;
|
||||||
@@ -1449,7 +1449,7 @@ add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stri
|
|||||||
specialize qw/vpx_mse8x8 sse2 msa/;
|
specialize qw/vpx_mse8x8 sse2 msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
|
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
|
||||||
specialize qw/vpx_get_mb_ss mmx sse2 msa/;
|
specialize qw/vpx_get_mb_ss sse2 msa/;
|
||||||
|
|
||||||
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
|
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
|
||||||
specialize qw/vpx_get4x4sse_cs neon msa/;
|
specialize qw/vpx_get4x4sse_cs neon msa/;
|
||||||
|
@@ -13,407 +13,6 @@
|
|||||||
|
|
||||||
%define mmx_filter_shift 7
|
%define mmx_filter_shift 7
|
||||||
|
|
||||||
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
|
|
||||||
global sym(vpx_get_mb_ss_mmx) PRIVATE
|
|
||||||
sym(vpx_get_mb_ss_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 7
|
|
||||||
GET_GOT rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, 8
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
mov rax, arg(0) ;src_ptr
|
|
||||||
mov rcx, 16
|
|
||||||
pxor mm4, mm4
|
|
||||||
|
|
||||||
.NEXTROW:
|
|
||||||
movq mm0, [rax]
|
|
||||||
movq mm1, [rax+8]
|
|
||||||
movq mm2, [rax+16]
|
|
||||||
movq mm3, [rax+24]
|
|
||||||
pmaddwd mm0, mm0
|
|
||||||
pmaddwd mm1, mm1
|
|
||||||
pmaddwd mm2, mm2
|
|
||||||
pmaddwd mm3, mm3
|
|
||||||
|
|
||||||
paddd mm4, mm0
|
|
||||||
paddd mm4, mm1
|
|
||||||
paddd mm4, mm2
|
|
||||||
paddd mm4, mm3
|
|
||||||
|
|
||||||
add rax, 32
|
|
||||||
dec rcx
|
|
||||||
ja .NEXTROW
|
|
||||||
movq QWORD PTR [rsp], mm4
|
|
||||||
|
|
||||||
;return sum[0]+sum[1];
|
|
||||||
movsxd rax, dword ptr [rsp]
|
|
||||||
movsxd rcx, dword ptr [rsp+4]
|
|
||||||
add rax, rcx
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 8
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
RESTORE_GOT
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
;void vpx_get8x8var_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int *SSE,
|
|
||||||
; int *Sum
|
|
||||||
;)
|
|
||||||
global sym(vpx_get8x8var_mmx) PRIVATE
|
|
||||||
sym(vpx_get8x8var_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
pxor mm5, mm5 ; Blank mmx6
|
|
||||||
pxor mm6, mm6 ; Blank mmx7
|
|
||||||
pxor mm7, mm7 ; Blank mmx7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Row 1
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 5
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
; movq mm4, [rbx + rdx]
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 6
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 7
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 8
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Now accumulate the final results.
|
|
||||||
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
|
|
||||||
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
|
|
||||||
movsx rdx, WORD PTR [rsp+8]
|
|
||||||
movsx rcx, WORD PTR [rsp+10]
|
|
||||||
movsx rbx, WORD PTR [rsp+12]
|
|
||||||
movsx rax, WORD PTR [rsp+14]
|
|
||||||
add rdx, rcx
|
|
||||||
add rbx, rax
|
|
||||||
add rdx, rbx ;XSum
|
|
||||||
movsxd rax, DWORD PTR [rsp]
|
|
||||||
movsxd rcx, DWORD PTR [rsp+4]
|
|
||||||
add rax, rcx ;XXSum
|
|
||||||
mov rsi, arg(4) ;SSE
|
|
||||||
mov rdi, arg(5) ;Sum
|
|
||||||
mov dword ptr [rsi], eax
|
|
||||||
mov dword ptr [rdi], edx
|
|
||||||
xor rax, rax ; return 0
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
;void
|
|
||||||
;vpx_get4x4var_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int *SSE,
|
|
||||||
; int *Sum
|
|
||||||
;)
|
|
||||||
global sym(vpx_get4x4var_mmx) PRIVATE
|
|
||||||
sym(vpx_get4x4var_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
pxor mm5, mm5 ; Blank mmx6
|
|
||||||
pxor mm6, mm6 ; Blank mmx7
|
|
||||||
pxor mm7, mm7 ; Blank mmx7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Row 1
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Now accumulate the final results.
|
|
||||||
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
|
|
||||||
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
|
|
||||||
movsx rdx, WORD PTR [rsp+8]
|
|
||||||
movsx rcx, WORD PTR [rsp+10]
|
|
||||||
movsx rbx, WORD PTR [rsp+12]
|
|
||||||
movsx rax, WORD PTR [rsp+14]
|
|
||||||
add rdx, rcx
|
|
||||||
add rbx, rax
|
|
||||||
add rdx, rbx ;XSum
|
|
||||||
movsxd rax, DWORD PTR [rsp]
|
|
||||||
movsxd rcx, DWORD PTR [rsp+4]
|
|
||||||
add rax, rcx ;XXSum
|
|
||||||
mov rsi, arg(4) ;SSE
|
|
||||||
mov rdi, arg(5) ;Sum
|
|
||||||
mov dword ptr [rsi], eax
|
|
||||||
mov dword ptr [rdi], edx
|
|
||||||
xor rax, rax ; return 0
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
;void vpx_filter_block2d_bil4x4_var_mmx
|
;void vpx_filter_block2d_bil4x4_var_mmx
|
||||||
;(
|
;(
|
||||||
; unsigned char *ref_ptr,
|
; unsigned char *ref_ptr,
|
||||||
|
@@ -23,10 +23,6 @@ DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
|
|||||||
{ 16, 16, 16, 16, 112, 112, 112, 112 }
|
{ 16, 16, 16, 16, 112, 112, 112, 112 }
|
||||||
};
|
};
|
||||||
|
|
||||||
extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
|
|
||||||
const uint8_t *b, int b_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
|
extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
|
||||||
int ref_pixels_per_line,
|
int ref_pixels_per_line,
|
||||||
const unsigned char *src_ptr,
|
const unsigned char *src_ptr,
|
||||||
@@ -47,98 +43,6 @@ extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
|
|||||||
unsigned int *sumsquared);
|
unsigned int *sumsquared);
|
||||||
|
|
||||||
|
|
||||||
unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
|
|
||||||
const unsigned char *b, int b_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
unsigned int var;
|
|
||||||
int avg;
|
|
||||||
|
|
||||||
vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
|
|
||||||
*sse = var;
|
|
||||||
return (var - (((unsigned int)avg * avg) >> 4));
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
|
|
||||||
const unsigned char *b, int b_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
unsigned int var;
|
|
||||||
int avg;
|
|
||||||
|
|
||||||
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
|
|
||||||
*sse = var;
|
|
||||||
|
|
||||||
return (var - (((unsigned int)avg * avg) >> 6));
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
|
|
||||||
const unsigned char *b, int b_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
unsigned int sse0, sse1, sse2, sse3, var;
|
|
||||||
int sum0, sum1, sum2, sum3;
|
|
||||||
|
|
||||||
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
|
|
||||||
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
|
|
||||||
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
|
|
||||||
b + 8 * b_stride, b_stride, &sse2, &sum2);
|
|
||||||
vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
|
|
||||||
b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
|
|
||||||
|
|
||||||
var = sse0 + sse1 + sse2 + sse3;
|
|
||||||
*sse = var;
|
|
||||||
return var;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
|
|
||||||
const unsigned char *b, int b_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
unsigned int sse0, sse1, sse2, sse3, var;
|
|
||||||
int sum0, sum1, sum2, sum3, avg;
|
|
||||||
|
|
||||||
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
|
|
||||||
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
|
|
||||||
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
|
|
||||||
b + 8 * b_stride, b_stride, &sse2, &sum2);
|
|
||||||
vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
|
|
||||||
b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
|
|
||||||
|
|
||||||
var = sse0 + sse1 + sse2 + sse3;
|
|
||||||
avg = sum0 + sum1 + sum2 + sum3;
|
|
||||||
*sse = var;
|
|
||||||
return (var - (((unsigned int)avg * avg) >> 8));
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
|
|
||||||
const unsigned char *b, int b_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
unsigned int sse0, sse1, var;
|
|
||||||
int sum0, sum1, avg;
|
|
||||||
|
|
||||||
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
|
|
||||||
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
|
|
||||||
|
|
||||||
var = sse0 + sse1;
|
|
||||||
avg = sum0 + sum1;
|
|
||||||
*sse = var;
|
|
||||||
return (var - (((unsigned int)avg * avg) >> 7));
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
|
|
||||||
const unsigned char *b, int b_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
unsigned int sse0, sse1, var;
|
|
||||||
int sum0, sum1, avg;
|
|
||||||
|
|
||||||
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
|
|
||||||
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
|
|
||||||
b + 8 * b_stride, b_stride, &sse1, &sum1);
|
|
||||||
|
|
||||||
var = sse0 + sse1;
|
|
||||||
avg = sum0 + sum1;
|
|
||||||
*sse = var;
|
|
||||||
|
|
||||||
return (var - (((unsigned int)avg * avg) >> 7));
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
|
uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
|
||||||
int xoffset, int yoffset,
|
int xoffset, int yoffset,
|
||||||
const uint8_t *b, int b_stride,
|
const uint8_t *b, int b_stride,
|
||||||
|
Reference in New Issue
Block a user