Removing variance MMX code.
Removed functions:
* vp9_mse16x16_mmx
* vp9_get_mb_ss_mmx
* vp9_get4x4var_mmx
* vp9_get8x8var_mmx
* vp9_variance4x4_mmx
* vp9_variance8x8_mmx
* vp9_variance16x16_mmx
* vp9_variance16x8_mmx
* vp9_variance8x16_mmx

They all have SSE2 equivalents.

Change-Id: I3796f2477c4f59b35b4828f46a300c16e62a2615
This commit is contained in:
parent
8e78a0d365
commit
12cd6f421d
@ -485,21 +485,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(6, 5, subpel_avg_variance64x32_c),
|
||||
make_tuple(6, 6, subpel_avg_variance64x64_c)));
|
||||
|
||||
// Registers the MMX variance kernels with the VP9VarianceTest fixture.
// Each tuple pairs a kernel with two small integers; judging by the values
// used here (4 -> 2, 8 -> 3, 16 -> 4) they appear to be log2(width) and
// log2(height) of the block -- NOTE(review): confirm against the fixture.
#if HAVE_MMX
|
||||
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
|
||||
const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
|
||||
const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx;
|
||||
const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
|
||||
const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
MMX, VP9VarianceTest,
|
||||
::testing::Values(make_tuple(2, 2, variance4x4_mmx),
|
||||
make_tuple(3, 3, variance8x8_mmx),
|
||||
make_tuple(3, 4, variance8x16_mmx),
|
||||
make_tuple(4, 3, variance16x8_mmx),
|
||||
make_tuple(4, 4, variance16x16_mmx)));
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
#if CONFIG_USE_X86INC
|
||||
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
|
||||
|
@ -420,19 +420,19 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc
|
||||
specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
|
||||
specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
|
||||
specialize qw/vp9_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";
|
||||
specialize qw/vp9_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc";
|
||||
specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc";
|
||||
specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
|
||||
@ -444,7 +444,7 @@ add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_
|
||||
specialize qw/vp9_variance4x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
|
||||
specialize qw/vp9_variance4x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
@ -693,7 +693,7 @@ add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, cons
|
||||
specialize qw/vp9_sad4x4x4d sse/;
|
||||
|
||||
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";
|
||||
specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vp9_mse8x16/;
|
||||
@ -705,7 +705,7 @@ add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stri
|
||||
specialize qw/vp9_mse8x8/;
|
||||
|
||||
add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
|
||||
specialize qw/vp9_get_mb_ss mmx sse2/;
|
||||
specialize qw/vp9_get_mb_ss sse2/;
|
||||
# ENCODEMB INVOKE
|
||||
|
||||
add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
|
||||
|
@ -1,510 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of 256 consecutive 16-bit values starting at
; src_ptr: the loop below runs 16 times and consumes 32 bytes (16 words)
; per iteration.
;
; mm4 holds two 32-bit partial sums; they are spilled to the stack, each
; sign-extended, and added together to form the return value in rax.
;
; NOTE(review): no emms is issued here -- presumably the caller resets the
; MMX/x87 state; confirm.
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                      ; 8 bytes of scratch for mm4
    ; end prolog

    mov         rax, arg(0)                 ; src_ptr
    mov         rcx, 16                     ; iteration count
    pxor        mm4, mm4                    ; clear the accumulator

.NEXTROW:
    movq        mm0, [rax]                  ; 4 words each
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]
    pmaddwd     mm0, mm0                    ; square words, add adjacent pairs
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ; dec leaves CF untouched, so the former "ja" also tested the carry left
    ; behind by the add above.  jnz looks at ZF only, which is what the
    ; counter test actually needs.
    jnz         .NEXTROW
    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
|
||||
;unsigned int vp9_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks an 8x8 block of bytes in src_ptr and ref_ptr and stores
;   *SSE = sum of (src - ref)^2
;   *Sum = sum of (src - ref)
; The value returned in rax is always 0.
;
; Register roles:
;   rax / rbx : current source / reference row
;   rcx / rdx : source / reference stride (sign-extended from 32 bits)
;   mm1       : reference row, loaded one row ahead of its use
;   mm5       : four 16-bit running sums of the differences
;   mm6       : zero, used to widen bytes to words
;   mm7       : two 32-bit running sums of the squared differences

; Processes one 8-pixel row.  %1 != 0: step both pointers to the next row and
; preload its reference pixels into mm1.  %1 == 0: final row, nothing further
; is loaded or advanced.
%macro VP9_GET8X8VAR_MMX_ROW 1
    movq        mm0, [rax]                  ; eight source bytes
    movq        mm2, mm0                    ; copies for the high halves
    movq        mm3, mm1

    punpcklbw   mm0, mm6                    ; widen low four bytes to words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                    ; widen high four bytes to words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                    ; src - ref (low half)
    psubsw      mm2, mm3                    ; src - ref (high half)

    paddw       mm5, mm0                    ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                    ; square and pair-add
    pmaddwd     mm2, mm2
%if %1
    add         rbx, rdx                    ; next reference row
    add         rax, rcx                    ; next source row
    movq        mm1, [rbx]                  ; preload reference pixels
%endif
    paddd       mm7, mm0                    ; accumulate squares
    paddd       mm7, mm2
%endmacro

global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; scratch for mm7 (low) and mm5 (high)
    ; end prolog

    pxor        mm5, mm5                    ; clear difference accumulator
    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; clear squared-difference accumulator

    mov         rax, arg(0)                 ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)                 ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)       ;[source_stride]
    movsxd      rdx, dword ptr arg(3)       ;[recon_stride]

    movq        mm1, [rbx]                  ; reference pixels of row 1

    ; Rows 1-7 each preload the following reference row; row 8 does not, so
    ; nothing beyond the eighth row is ever read.
%rep 7
    VP9_GET8X8VAR_MMX_ROW 1
%endrep
    VP9_GET8X8VAR_MMX_ROW 0

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5      ; four 16-bit difference sums
    movq        QWORD PTR [rsp], mm7        ; two 32-bit square sums
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                    ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                    ;XXSum
    mov         rsi, arg(4)                 ;SSE
    mov         rdi, arg(5)                 ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
|
||||
|
||||
;unsigned int
;vp9_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; 4x4 counterpart of the 8x8 routine: stores the sum of squared differences
; through SSE and the sum of differences through Sum, then returns 0 in rax.
;
;   mm1 : reference row, loaded one row ahead of its use
;   mm5 : 16-bit running sums of the differences
;   mm6 : zero, used to widen bytes to words
;   mm7 : 32-bit running sums of the squared differences

; One 4-pixel row.  A non-zero argument advances both row pointers and loads
; the next reference row into mm1; zero marks the last row.
%macro VP9_GET4X4VAR_MMX_ROW 1
    movd        mm0, [rax]                  ; four source bytes
    punpcklbw   mm0, mm6                    ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                    ; src - ref
    paddw       mm5, mm0                    ; accumulate differences
    pmaddwd     mm0, mm0                    ; square and pair-add
%if %1
    add         rbx, rdx                    ; next reference row
    add         rax, rcx                    ; next source row
    movd        mm1, [rbx]                  ; four reference bytes
%endif
    paddd       mm7, mm0                    ; accumulate squares
%endmacro

global sym(vp9_get4x4var_mmx) PRIVATE
sym(vp9_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5                    ; clear difference accumulator
    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; clear squared-difference accumulator

    mov         rax, arg(0)                 ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)                 ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)       ;[source_stride]
    movsxd      rdx, dword ptr arg(3)       ;[recon_stride]

    movd        mm1, [rbx]                  ; reference pixels of row 1

%rep 3
    VP9_GET4X4VAR_MMX_ROW 1                 ; rows 1-3
%endrep
    VP9_GET4X4VAR_MMX_ROW 0                 ; row 4

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5      ; 16-bit difference sums
    movq        QWORD PTR [rsp], mm7        ; 32-bit square sums
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                    ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                    ;XXSum
    mov         rsi, arg(4)                 ;SSE
    mov         rdi, arg(5)                 ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
|
||||
|
||||
;unsigned int
;vp9_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns the sum of squared differences between two 4x4 byte blocks.
;
;   mm1 : reference row, loaded one row ahead of its use
;   mm6 : zero, used to widen bytes to words
;   mm7 : two 32-bit running sums of the squared differences

; One 4-pixel row.  A non-zero argument advances both row pointers and loads
; the next reference row into mm1; zero marks the last row.
%macro VP9_GET4X4SSE_CS_MMX_ROW 1
    movd        mm0, [rax]                  ; four source bytes
    punpcklbw   mm0, mm6                    ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                    ; src - ref
    pmaddwd     mm0, mm0                    ; square and pair-add
%if %1
    add         rbx, rdx                    ; next reference row
    add         rax, rcx                    ; next source row
    movd        mm1, [rbx]                  ; four reference bytes
%endif
    paddd       mm7, mm0                    ; accumulate squares
%endmacro

global sym(vp9_get4x4sse_cs_mmx) PRIVATE
sym(vp9_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; clear accumulator

    mov         rax, arg(0)                 ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)                 ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)       ;[source_stride]
    movsxd      rdx, dword ptr arg(3)       ;[recon_stride]

    movd        mm1, [rbx]                  ; reference pixels of row 1

%rep 3
    VP9_GET4X4SSE_CS_MMX_ROW 1              ; rows 1-3
%endrep
    VP9_GET4X4SSE_CS_MMX_ROW 0              ; row 4

    ; Fold the two 32-bit lanes of mm7 together; the total ends up in the low
    ; 32 bits of the value moved into rax.
    movq        mm0, mm7
    psrlq       mm7, 32
    paddd       mm0, mm7
    movq        rax, mm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
/*
 * Prototypes for the assembly helpers used by the wrappers in this file.
 * Each one compares a block of src against ref and writes the sum of squared
 * differences through its fifth argument and the sum of differences through
 * sum.  The assembly clears rax before returning, so the declared return
 * value is always 0 and the callers below ignore it.
 */
unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
/* 4x4 variant of the helper above. */
unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *SSE, int *sum);
|
||||
|
||||
/*
 * Variance of a 4x4 block: SSE - sum^2 / 16.
 * The sum of squared differences is also stored through sse.
 */
unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 unsigned int *sse) {
  int diff_total;
  unsigned int diff_total_sq;

  vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &diff_total);

  /* The cast forces the multiplication to be carried out as unsigned. */
  diff_total_sq = (unsigned int)diff_total * diff_total;
  return *sse - (diff_total_sq >> 4);
}
|
||||
|
||||
/*
 * Variance of an 8x8 block: SSE - sum^2 / 64.
 * The sum of squared differences is also stored through sse.
 */
unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 unsigned int *sse) {
  int diff_total;
  unsigned int diff_total_sq;

  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &diff_total);

  /* The cast forces the multiplication to be carried out as unsigned. */
  diff_total_sq = (unsigned int)diff_total * diff_total;
  return *sse - (diff_total_sq >> 6);
}
|
||||
|
||||
/*
 * Sum of squared differences over a 16x16 block, gathered from its four 8x8
 * quadrants (top-left, top-right, bottom-left, bottom-right).  The total is
 * stored through sse and also returned; the per-quadrant sums of differences
 * are computed by the helper but not used.
 */
unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  unsigned int total_sse = 0;
  int q;

  for (q = 0; q < 4; ++q) {
    const int col = (q & 1) * 8;  /* 0, 8, 0, 8 */
    const int row = (q >> 1) * 8; /* 0, 0, 8, 8 */
    unsigned int quad_sse;
    int quad_sum;

    vp9_get8x8var_mmx(src + row * src_stride + col, src_stride,
                      ref + row * ref_stride + col, ref_stride,
                      &quad_sse, &quad_sum);
    total_sse += quad_sse;
  }

  *sse = total_sse;
  return *sse;
}
|
||||
|
||||
|
||||
/*
 * Variance of a 16x16 block: SSE - sum^2 / 256, with SSE and sum gathered
 * from the four 8x8 quadrants (top-left, top-right, bottom-left,
 * bottom-right).  The total SSE is also stored through sse.
 */
unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   unsigned int *sse) {
  unsigned int total_sse = 0;
  int total_sum = 0;
  int q;

  for (q = 0; q < 4; ++q) {
    const int col = (q & 1) * 8;  /* 0, 8, 0, 8 */
    const int row = (q >> 1) * 8; /* 0, 0, 8, 8 */
    unsigned int quad_sse;
    int quad_sum;

    vp9_get8x8var_mmx(src + row * src_stride + col, src_stride,
                      ref + row * ref_stride + col, ref_stride,
                      &quad_sse, &quad_sum);
    total_sse += quad_sse;
    total_sum += quad_sum;
  }

  *sse = total_sse;
  /* The cast forces the multiplication to be carried out as unsigned. */
  return *sse - (((unsigned int)total_sum * total_sum) >> 8);
}
|
||||
|
||||
/*
 * Variance of a 16-wide, 8-high block: SSE - sum^2 / 128, combining the left
 * and right 8x8 halves.  The total SSE is also stored through sse.
 */
unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  unsigned int total_sse = 0;
  int total_sum = 0;
  int half;

  for (half = 0; half < 2; ++half) {
    const int col = half * 8; /* left half, then right half */
    unsigned int half_sse;
    int half_sum;

    vp9_get8x8var_mmx(src + col, src_stride, ref + col, ref_stride,
                      &half_sse, &half_sum);
    total_sse += half_sse;
    total_sum += half_sum;
  }

  *sse = total_sse;
  /* The cast forces the multiplication to be carried out as unsigned. */
  return *sse - (((unsigned int)total_sum * total_sum) >> 7);
}
|
||||
|
||||
|
||||
/*
 * Variance of an 8-wide, 16-high block: SSE - sum^2 / 128, combining the top
 * and bottom 8x8 halves.  The total SSE is also stored through sse.
 */
unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  unsigned int total_sse = 0;
  int total_sum = 0;
  int half;

  for (half = 0; half < 2; ++half) {
    const int row = half * 8; /* top half, then bottom half */
    unsigned int half_sse;
    int half_sum;

    vp9_get8x8var_mmx(src + row * src_stride, src_stride,
                      ref + row * ref_stride, ref_stride,
                      &half_sse, &half_sum);
    total_sse += half_sse;
    total_sum += half_sum;
  }

  *sse = total_sse;
  /* The cast forces the multiplication to be carried out as unsigned. */
  return *sse - (((unsigned int)total_sum * total_sum) >> 7);
}
|
@ -93,8 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
|
||||
|
||||
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
|
||||
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
|
||||
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
|
||||
|
Loading…
x
Reference in New Issue
Block a user