Merge "Remove unused vp9_copy32xn" into experimental
This commit is contained in:
commit
5ac141187a
@ -491,16 +491,6 @@ specialize vp9_sad8x8x4d sse2
|
||||
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
|
||||
specialize vp9_sad4x4x4d sse
|
||||
|
||||
#
|
||||
# Block copy
|
||||
#
|
||||
case $arch in
|
||||
x86*)
|
||||
prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n"
|
||||
specialize vp9_copy32xn sse2 sse3
|
||||
;;
|
||||
esac
|
||||
|
||||
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
|
||||
specialize vp9_sub_pixel_mse16x16 sse2 mmx
|
||||
|
||||
|
@ -1683,14 +1683,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
|
||||
BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
|
||||
NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
|
||||
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
cpi->fn_ptr[BLOCK_16X16].copymem = vp9_copy32xn;
|
||||
cpi->fn_ptr[BLOCK_16X8].copymem = vp9_copy32xn;
|
||||
cpi->fn_ptr[BLOCK_8X16].copymem = vp9_copy32xn;
|
||||
cpi->fn_ptr[BLOCK_8X8].copymem = vp9_copy32xn;
|
||||
cpi->fn_ptr[BLOCK_4X4].copymem = vp9_copy32xn;
|
||||
#endif
|
||||
|
||||
cpi->full_search_sad = vp9_full_search_sad;
|
||||
cpi->diamond_search_sad = vp9_diamond_search_sad;
|
||||
cpi->refining_search_sad = vp9_refining_search_sad;
|
||||
|
@ -484,61 +484,3 @@ void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
|
||||
sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
|
||||
ref_ptr[3], ref_stride, 0x7fffffff);
|
||||
}
|
||||
|
||||
/* Copy 2 macroblocks to a buffer */
|
||||
void vp9_copy32xn_c(uint8_t *src_ptr,
|
||||
int src_stride,
|
||||
uint8_t *dst_ptr,
|
||||
int dst_stride,
|
||||
int height) {
|
||||
int r;
|
||||
|
||||
for (r = 0; r < height; r++) {
|
||||
#if !(CONFIG_FAST_UNALIGNED)
|
||||
dst_ptr[0] = src_ptr[0];
|
||||
dst_ptr[1] = src_ptr[1];
|
||||
dst_ptr[2] = src_ptr[2];
|
||||
dst_ptr[3] = src_ptr[3];
|
||||
dst_ptr[4] = src_ptr[4];
|
||||
dst_ptr[5] = src_ptr[5];
|
||||
dst_ptr[6] = src_ptr[6];
|
||||
dst_ptr[7] = src_ptr[7];
|
||||
dst_ptr[8] = src_ptr[8];
|
||||
dst_ptr[9] = src_ptr[9];
|
||||
dst_ptr[10] = src_ptr[10];
|
||||
dst_ptr[11] = src_ptr[11];
|
||||
dst_ptr[12] = src_ptr[12];
|
||||
dst_ptr[13] = src_ptr[13];
|
||||
dst_ptr[14] = src_ptr[14];
|
||||
dst_ptr[15] = src_ptr[15];
|
||||
dst_ptr[16] = src_ptr[16];
|
||||
dst_ptr[17] = src_ptr[17];
|
||||
dst_ptr[18] = src_ptr[18];
|
||||
dst_ptr[19] = src_ptr[19];
|
||||
dst_ptr[20] = src_ptr[20];
|
||||
dst_ptr[21] = src_ptr[21];
|
||||
dst_ptr[22] = src_ptr[22];
|
||||
dst_ptr[23] = src_ptr[23];
|
||||
dst_ptr[24] = src_ptr[24];
|
||||
dst_ptr[25] = src_ptr[25];
|
||||
dst_ptr[26] = src_ptr[26];
|
||||
dst_ptr[27] = src_ptr[27];
|
||||
dst_ptr[28] = src_ptr[28];
|
||||
dst_ptr[29] = src_ptr[29];
|
||||
dst_ptr[30] = src_ptr[30];
|
||||
dst_ptr[31] = src_ptr[31];
|
||||
#else
|
||||
((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
|
||||
((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
|
||||
((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
|
||||
((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
|
||||
((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
|
||||
((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
|
||||
((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
|
||||
((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
|
||||
#endif
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -19,12 +19,6 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
|
||||
int ref_stride,
|
||||
unsigned int max_sad);
|
||||
|
||||
typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride,
|
||||
int n);
|
||||
|
||||
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
@ -79,7 +73,6 @@ typedef struct variance_vtable {
|
||||
vp9_sad_multi_fn_t sdx3f;
|
||||
vp9_sad_multi1_fn_t sdx8f;
|
||||
vp9_sad_multi_d_fn_t sdx4df;
|
||||
vp9_copy32xn_fn_t copymem;
|
||||
} vp9_variance_fn_ptr_t;
|
||||
|
||||
#endif // VP9_ENCODER_VP9_VARIANCE_H_
|
||||
|
@ -8,85 +8,175 @@
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
%include "third_party/x86inc/x86inc.asm"
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
SECTION .text
|
||||
|
||||
;void vp9_copy32xn_sse2(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *dst_ptr,
|
||||
; int dst_stride,
|
||||
; int height);
|
||||
global sym(vp9_copy32xn_sse2) PRIVATE
|
||||
sym(vp9_copy32xn_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
SAVE_XMM 7
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
INIT_XMM sse2
|
||||
cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
mov n_rowsd, 64
|
||||
pxor m0, m0
|
||||
.loop:
|
||||
movu m1, [refq]
|
||||
movu m2, [refq+16]
|
||||
movu m3, [refq+32]
|
||||
movu m4, [refq+48]
|
||||
psadbw m1, [srcq]
|
||||
psadbw m2, [srcq+16]
|
||||
psadbw m3, [srcq+32]
|
||||
psadbw m4, [srcq+48]
|
||||
paddd m1, m2
|
||||
paddd m3, m4
|
||||
add refq, ref_strideq
|
||||
paddd m0, m1
|
||||
add srcq, src_strideq
|
||||
paddd m0, m3
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
mov rsi, arg(0) ;src_ptr
|
||||
mov rdi, arg(2) ;dst_ptr
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
|
||||
movsxd rax, dword ptr arg(1) ;src_stride
|
||||
movsxd rdx, dword ptr arg(3) ;dst_stride
|
||||
movsxd rcx, dword ptr arg(4) ;height
|
||||
; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
INIT_XMM sse2
|
||||
cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
mov n_rowsd, 16
|
||||
pxor m0, m0
|
||||
|
||||
.block_copy_sse2_loopx4:
|
||||
movdqu xmm0, XMMWORD PTR [rsi]
|
||||
movdqu xmm1, XMMWORD PTR [rsi + 16]
|
||||
movdqu xmm2, XMMWORD PTR [rsi + rax]
|
||||
movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
|
||||
.loop:
|
||||
movu m1, [refq]
|
||||
movu m2, [refq+16]
|
||||
movu m3, [refq+ref_strideq]
|
||||
movu m4, [refq+ref_strideq+16]
|
||||
psadbw m1, [srcq]
|
||||
psadbw m2, [srcq+16]
|
||||
psadbw m3, [srcq+src_strideq]
|
||||
psadbw m4, [srcq+src_strideq+16]
|
||||
paddd m1, m2
|
||||
paddd m3, m4
|
||||
lea refq, [refq+ref_strideq*2]
|
||||
paddd m0, m1
|
||||
lea srcq, [srcq+src_strideq*2]
|
||||
paddd m0, m3
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
|
||||
movdqu xmm4, XMMWORD PTR [rsi]
|
||||
movdqu xmm5, XMMWORD PTR [rsi + 16]
|
||||
movdqu xmm6, XMMWORD PTR [rsi + rax]
|
||||
movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
|
||||
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
%macro SAD16XN 1
|
||||
cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
|
||||
src_stride3, ref_stride3, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
lea src_stride3q, [src_strideq*3]
|
||||
lea ref_stride3q, [ref_strideq*3]
|
||||
mov n_rowsd, %1/4
|
||||
pxor m0, m0
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
.loop:
|
||||
movu m1, [refq]
|
||||
movu m2, [refq+ref_strideq]
|
||||
movu m3, [refq+ref_strideq*2]
|
||||
movu m4, [refq+ref_stride3q]
|
||||
psadbw m1, [srcq]
|
||||
psadbw m2, [srcq+src_strideq]
|
||||
psadbw m3, [srcq+src_strideq*2]
|
||||
psadbw m4, [srcq+src_stride3q]
|
||||
paddd m1, m2
|
||||
paddd m3, m4
|
||||
lea refq, [refq+ref_strideq*4]
|
||||
paddd m0, m1
|
||||
lea srcq, [srcq+src_strideq*4]
|
||||
paddd m0, m3
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
movdqa XMMWORD PTR [rdi], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 16], xmm1
|
||||
movdqa XMMWORD PTR [rdi + rdx], xmm2
|
||||
movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
lea rdi, [rdi+rdx*2]
|
||||
INIT_XMM sse2
|
||||
SAD16XN 16 ; sad16x16_sse2
|
||||
SAD16XN 8 ; sad16x8_sse2
|
||||
|
||||
movdqa XMMWORD PTR [rdi], xmm4
|
||||
movdqa XMMWORD PTR [rdi + 16], xmm5
|
||||
movdqa XMMWORD PTR [rdi + rdx], xmm6
|
||||
movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
|
||||
; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
%macro SAD8XN 1
|
||||
cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
|
||||
src_stride3, ref_stride3, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
lea src_stride3q, [src_strideq*3]
|
||||
lea ref_stride3q, [ref_strideq*3]
|
||||
mov n_rowsd, %1/4
|
||||
pxor m0, m0
|
||||
|
||||
lea rdi, [rdi+rdx*2]
|
||||
.loop:
|
||||
movh m1, [refq]
|
||||
movhps m1, [refq+ref_strideq]
|
||||
movh m2, [refq+ref_strideq*2]
|
||||
movhps m2, [refq+ref_stride3q]
|
||||
movh m3, [srcq]
|
||||
movhps m3, [srcq+src_strideq]
|
||||
movh m4, [srcq+src_strideq*2]
|
||||
movhps m4, [srcq+src_stride3q]
|
||||
psadbw m1, m3
|
||||
psadbw m2, m4
|
||||
lea refq, [refq+ref_strideq*4]
|
||||
paddd m0, m1
|
||||
lea srcq, [srcq+src_strideq*4]
|
||||
paddd m0, m2
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
sub rcx, 4
|
||||
cmp rcx, 4
|
||||
jge .block_copy_sse2_loopx4
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
cmp rcx, 0
|
||||
je .copy_is_done
|
||||
INIT_XMM sse2
|
||||
SAD8XN 16 ; sad8x16_sse2
|
||||
SAD8XN 8 ; sad8x8_sse2
|
||||
|
||||
.block_copy_sse2_loop:
|
||||
movdqu xmm0, XMMWORD PTR [rsi]
|
||||
movdqu xmm1, XMMWORD PTR [rsi + 16]
|
||||
lea rsi, [rsi+rax]
|
||||
|
||||
movdqa XMMWORD PTR [rdi], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 16], xmm1
|
||||
lea rdi, [rdi+rdx]
|
||||
|
||||
sub rcx, 1
|
||||
jne .block_copy_sse2_loop
|
||||
|
||||
.copy_is_done:
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
INIT_MMX sse
|
||||
cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
movd m0, [refq]
|
||||
movd m1, [refq+ref_strideq]
|
||||
movd m2, [srcq]
|
||||
movd m3, [srcq+src_strideq]
|
||||
lea refq, [refq+ref_strideq*2]
|
||||
lea srcq, [srcq+src_strideq*2]
|
||||
movd m4, [refq]
|
||||
movd m5, [refq+ref_strideq]
|
||||
movd m6, [srcq]
|
||||
movd m7, [srcq+src_strideq]
|
||||
punpckldq m0, m1
|
||||
punpckldq m2, m3
|
||||
punpckldq m4, m5
|
||||
punpckldq m6, m7
|
||||
psadbw m0, m2
|
||||
psadbw m4, m6
|
||||
paddd m0, m4
|
||||
movd eax, m0
|
||||
RET
|
||||
|
@ -1,182 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
%include "third_party/x86inc/x86inc.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
INIT_XMM sse2
|
||||
cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
mov n_rowsd, 64
|
||||
pxor m0, m0
|
||||
.loop:
|
||||
movu m1, [refq]
|
||||
movu m2, [refq+16]
|
||||
movu m3, [refq+32]
|
||||
movu m4, [refq+48]
|
||||
psadbw m1, [srcq]
|
||||
psadbw m2, [srcq+16]
|
||||
psadbw m3, [srcq+32]
|
||||
psadbw m4, [srcq+48]
|
||||
paddd m1, m2
|
||||
paddd m3, m4
|
||||
add refq, ref_strideq
|
||||
paddd m0, m1
|
||||
add srcq, src_strideq
|
||||
paddd m0, m3
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
|
||||
; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
INIT_XMM sse2
|
||||
cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
mov n_rowsd, 16
|
||||
pxor m0, m0
|
||||
|
||||
.loop:
|
||||
movu m1, [refq]
|
||||
movu m2, [refq+16]
|
||||
movu m3, [refq+ref_strideq]
|
||||
movu m4, [refq+ref_strideq+16]
|
||||
psadbw m1, [srcq]
|
||||
psadbw m2, [srcq+16]
|
||||
psadbw m3, [srcq+src_strideq]
|
||||
psadbw m4, [srcq+src_strideq+16]
|
||||
paddd m1, m2
|
||||
paddd m3, m4
|
||||
lea refq, [refq+ref_strideq*2]
|
||||
paddd m0, m1
|
||||
lea srcq, [srcq+src_strideq*2]
|
||||
paddd m0, m3
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
|
||||
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
%macro SAD16XN 1
|
||||
cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
|
||||
src_stride3, ref_stride3, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
lea src_stride3q, [src_strideq*3]
|
||||
lea ref_stride3q, [ref_strideq*3]
|
||||
mov n_rowsd, %1/4
|
||||
pxor m0, m0
|
||||
|
||||
.loop:
|
||||
movu m1, [refq]
|
||||
movu m2, [refq+ref_strideq]
|
||||
movu m3, [refq+ref_strideq*2]
|
||||
movu m4, [refq+ref_stride3q]
|
||||
psadbw m1, [srcq]
|
||||
psadbw m2, [srcq+src_strideq]
|
||||
psadbw m3, [srcq+src_strideq*2]
|
||||
psadbw m4, [srcq+src_stride3q]
|
||||
paddd m1, m2
|
||||
paddd m3, m4
|
||||
lea refq, [refq+ref_strideq*4]
|
||||
paddd m0, m1
|
||||
lea srcq, [srcq+src_strideq*4]
|
||||
paddd m0, m3
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
SAD16XN 16 ; sad16x16_sse2
|
||||
SAD16XN 8 ; sad16x8_sse2
|
||||
|
||||
; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
%macro SAD8XN 1
|
||||
cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
|
||||
src_stride3, ref_stride3, n_rows
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
lea src_stride3q, [src_strideq*3]
|
||||
lea ref_stride3q, [ref_strideq*3]
|
||||
mov n_rowsd, %1/4
|
||||
pxor m0, m0
|
||||
|
||||
.loop:
|
||||
movh m1, [refq]
|
||||
movhps m1, [refq+ref_strideq]
|
||||
movh m2, [refq+ref_strideq*2]
|
||||
movhps m2, [refq+ref_stride3q]
|
||||
movh m3, [srcq]
|
||||
movhps m3, [srcq+src_strideq]
|
||||
movh m4, [srcq+src_strideq*2]
|
||||
movhps m4, [srcq+src_stride3q]
|
||||
psadbw m1, m3
|
||||
psadbw m2, m4
|
||||
lea refq, [refq+ref_strideq*4]
|
||||
paddd m0, m1
|
||||
lea srcq, [srcq+src_strideq*4]
|
||||
paddd m0, m2
|
||||
dec n_rowsd
|
||||
jg .loop
|
||||
|
||||
movhlps m1, m0
|
||||
paddd m0, m1
|
||||
movd eax, m0
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
SAD8XN 16 ; sad8x16_sse2
|
||||
SAD8XN 8 ; sad8x8_sse2
|
||||
|
||||
; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride,
|
||||
; uint8_t *ref, int ref_stride);
|
||||
INIT_MMX sse
|
||||
cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride
|
||||
movsxdifnidn src_strideq, src_strided
|
||||
movsxdifnidn ref_strideq, ref_strided
|
||||
movd m0, [refq]
|
||||
movd m1, [refq+ref_strideq]
|
||||
movd m2, [srcq]
|
||||
movd m3, [srcq+src_strideq]
|
||||
lea refq, [refq+ref_strideq*2]
|
||||
lea srcq, [srcq+src_strideq*2]
|
||||
movd m4, [refq]
|
||||
movd m5, [refq+ref_strideq]
|
||||
movd m6, [srcq]
|
||||
movd m7, [srcq+src_strideq]
|
||||
punpckldq m0, m1
|
||||
punpckldq m2, m3
|
||||
punpckldq m4, m5
|
||||
punpckldq m6, m7
|
||||
psadbw m0, m2
|
||||
psadbw m4, m6
|
||||
paddd m0, m4
|
||||
movd eax, m0
|
||||
RET
|
@ -376,64 +376,3 @@ sym(vp9_sad4x4x3_sse3):
|
||||
movd [rcx+8], mm7
|
||||
|
||||
STACK_FRAME_DESTROY_X3
|
||||
|
||||
;void vp9_copy32xn_sse3(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *dst_ptr,
|
||||
; int dst_stride,
|
||||
; int height);
|
||||
global sym(vp9_copy32xn_sse3) PRIVATE
|
||||
sym(vp9_copy32xn_sse3):
|
||||
|
||||
STACK_FRAME_CREATE_X3
|
||||
|
||||
.block_copy_sse3_loopx4:
|
||||
lea end_ptr, [src_ptr+src_stride*2]
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [src_ptr]
|
||||
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
|
||||
movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
|
||||
movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
|
||||
movdqu xmm4, XMMWORD PTR [end_ptr]
|
||||
movdqu xmm5, XMMWORD PTR [end_ptr + 16]
|
||||
movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
|
||||
movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
|
||||
|
||||
lea src_ptr, [src_ptr+src_stride*4]
|
||||
|
||||
lea end_ptr, [ref_ptr+ref_stride*2]
|
||||
|
||||
movdqa XMMWORD PTR [ref_ptr], xmm0
|
||||
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
|
||||
movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
|
||||
movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
|
||||
movdqa XMMWORD PTR [end_ptr], xmm4
|
||||
movdqa XMMWORD PTR [end_ptr + 16], xmm5
|
||||
movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
|
||||
movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
|
||||
|
||||
lea ref_ptr, [ref_ptr+ref_stride*4]
|
||||
|
||||
sub height, 4
|
||||
cmp height, 4
|
||||
jge .block_copy_sse3_loopx4
|
||||
|
||||
;Check to see if there is more rows need to be copied.
|
||||
cmp height, 0
|
||||
je .copy_is_done
|
||||
|
||||
.block_copy_sse3_loop:
|
||||
movdqu xmm0, XMMWORD PTR [src_ptr]
|
||||
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
|
||||
lea src_ptr, [src_ptr+src_stride]
|
||||
|
||||
movdqa XMMWORD PTR [ref_ptr], xmm0
|
||||
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
|
||||
lea ref_ptr, [ref_ptr+ref_stride]
|
||||
|
||||
sub height, 1
|
||||
jne .block_copy_sse3_loop
|
||||
|
||||
.copy_is_done:
|
||||
STACK_FRAME_DESTROY_X3
|
||||
|
@ -94,8 +94,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2_yasm.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2_yasm.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
|
||||
#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
|
||||
|
Loading…
x
Reference in New Issue
Block a user