SSE2 code for the filter in MFQE.
The SSE2 code is reused from VP8's MFQE; there is no change on the VP8 side. In our testing, adopting this change gives a 2x speedup.

Change-Id: Ib2b14144ae57c892005c1c4b84e3379d02e56716
commit 09673deba9
parent fe3f21099f
vp9/common/vp9_mfqe.c

@@ -35,14 +35,26 @@ static void filter_by_weight(const uint8_t *src, int src_stride,
   }
 }
 
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride, int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
 static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int weight) {
-  filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
-  filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
-  filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
-                   dst_stride, 16, weight);
-  filter_by_weight(src + src_stride * 16 + 16, src_stride,
-                   dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
+  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
+                            weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
+                            dst + dst_stride * 16, dst_stride, weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
+                            dst + dst_stride * 16 + 16, dst_stride, weight);
 }
 
 static void filter_by_weight64x64(const uint8_t *src, int src_stride,
@@ -62,13 +74,13 @@ static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                           int uvd_stride, BLOCK_SIZE block_size,
                           int weight) {
   if (block_size == BLOCK_16X16) {
-    filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
-    filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
-    filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
   } else if (block_size == BLOCK_32X32) {
     filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
-    filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
-    filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
   } else if (block_size == BLOCK_64X64) {
     filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
     filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
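Note: the C fallbacks above and the SSE2 kernels in the new file below compute the same per-pixel weighted blend. The constants tMFQE and tMFQE_round in the assembly correspond to 1 << MFQE_PRECISION and 1 << (MFQE_PRECISION - 1), with MFQE_PRECISION equal to 4. A minimal sketch of the arithmetic (an illustration of the formula only, not the body of filter_by_weight from the tree):

/* Per-pixel blend; weights live on a 0..16 scale (MFQE_PRECISION == 4). */
static uint8_t blend_pixel(uint8_t src, uint8_t dst, int src_weight) {
  const int dst_weight = (1 << 4) - src_weight;  /* tMFQE - src_weight */
  return (uint8_t)((src * src_weight + dst * dst_weight + (1 << 3)) >> 4);
}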
vp9/common/vp9_rtcd_defs.pl

@@ -274,6 +274,12 @@ $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
 specialize qw/vp9_plane_add_noise sse2/;
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+
+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight16x16 sse2/;
+
+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight8x8 sse2/;
 }
 
 #
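For context: each add_proto/specialize pair makes the rtcd generator emit an entry point that is bound to the best available implementation at runtime. Roughly as follows (a simplified, hedged sketch of the generated dispatch, not the literal vp9_rtcd.h output; x86_simd_caps() and HAS_SSE2 come from vpx_ports/x86.h):

/* Simplified model of rtcd dispatch for one prototype. */
void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int src_weight);
void vp9_filter_by_weight16x16_sse2(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride,
                                    int src_weight);

void (*vp9_filter_by_weight16x16)(const uint8_t *, int, uint8_t *, int, int) =
    vp9_filter_by_weight16x16_c;

static void setup_rtcd_internal(void) {
  const int flags = x86_simd_caps();
  if (flags & HAS_SSE2)
    vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2;
}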
vp9/common/x86/vp9_mfqe_sse2.asm (new file, 287 lines)
@@ -0,0 +1,287 @@
;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; This file is a duplicate of mfqe_sse2.asm in VP8.
; TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"

;void vp9_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp9_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp9_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE:                                      ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round:                                ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
vp9/vp9_common.mk

@@ -82,6 +82,7 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
 
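Finally, a hedged C model of what vp9_variance_and_sad_16x16_sse2 computes, per my reading of the assembly (not code from the tree; note that both outputs are pre-scaled by 1/256 via the rounded 8-bit shifts):

/* C model of the SSE2 routine: SAD of the two blocks and variance of src2,
 * each rounded and divided by 256 (the 16x16 pixel count). */
#include <stdint.h>
#include <stdlib.h>

static void variance_and_sad_16x16_model(const uint8_t *src1, int stride1,
                                         const uint8_t *src2, int stride2,
                                         unsigned int *variance,
                                         unsigned int *sad) {
  unsigned int sad_acc = 0, sum2 = 0, sse2 = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      sad_acc += abs(src1[c] - src2[c]);
      sum2 += src2[c];            /* sum of src2 (psadbw against zero) */
      sse2 += src2[c] * src2[c];  /* sum of src2^2 (pmaddwd on words) */
    }
    src1 += stride1;
    src2 += stride2;
  }
  *sad = (sad_acc + 128) >> 8;                      /* (SAD + 128) >> 8 */
  *variance = (sse2 - ((sum2 * sum2) >> 8) + 128) >> 8;
}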