Merge "SSE2 code for the filter in MFQE."

2015-01-23 11:08:16 -08:00 · 2015-01-23 11:08:16 -08:00 · 65f60f8e8c
commit 65f60f8e8c
parent 0e2e2c2652 09673deba9
4 changed files with 317 additions and 11 deletions
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -35,14 +35,26 @@ static void filter_by_weight(const uint8_t *src, int src_stride,
  }
 }
 // Plain-C 8x8 entry point: forwards to filter_by_weight() with 8 as the
 // size argument, passing src_weight through unchanged.
 void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride, int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
 }
 // Plain-C 16x16 entry point: forwards to filter_by_weight() with 16 as the
 // size argument, passing src_weight through unchanged.
 void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
 }
 // Filters a 32x32 area as four 16x16 quadrants: top-left, top-right
 // (+16 columns), bottom-left (+16 rows) and bottom-right (+16 rows and
 // +16 columns). The same weight is used for every quadrant.
 static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
-  filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
+  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
-  filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
+  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
-  filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
+                            weight);
-                   dst_stride, 16, weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
-  filter_by_weight(src + src_stride * 16 + 16, src_stride,
+                            dst + dst_stride * 16, dst_stride, weight);
-                   dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
                            dst + dst_stride * 16 + 16, dst_stride, weight);
 }
 static void filter_by_weight64x64(const uint8_t *src, int src_stride,
@@ -62,13 +74,13 @@ static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                          int uvd_stride, BLOCK_SIZE block_size,
                          int weight) {
  if (block_size == BLOCK_16X16) {
-    filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
+    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
-    filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
+    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
-    filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_32X32) {
    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
-    filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
+    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
-    filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_64X64) {
    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -274,6 +274,12 @@ $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
 specialize qw/vp9_plane_add_noise sse2/;
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp9_filter_by_weight16x16 sse2/;
 add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp9_filter_by_weight8x8 sse2/;
 }
 #
--- a/vp9/common/x86/vp9_mfqe_sse2.asm
+++ b/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
 ;
 ;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 ;  This file is a duplicate of mfqe_sse2.asm in VP8.
 ;  TODO(jackychen): Find a way to fix the duplicate.
 %include "vpx_ports/x86_abi_support.asm"
 ;void vp9_filter_by_weight16x16_sse2
 ;(
 ;    const uint8_t *src,
 ;    int            src_stride,
 ;    uint8_t       *dst,
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
 ;
 ; Blends a 16x16 block of src into dst; the result is written back to dst.
 ; For every byte:
 ;     dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
 ; where 16 is tMFQE (1 << MFQE_PRECISION) and 8 is tMFQE_round.
 ;
 ; Rows are loaded and stored with movdqa, so every row address of both src
 ; and dst must be 16-byte aligned (pointers and strides alike).
 ; Only the low 16 bits of src_weight are used, and all arithmetic is done in
 ; 16-bit lanes; assumes 0 <= src_weight <= 16 -- TODO confirm against callers.
 global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
 sym(vp9_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ; xmm6 is used below as the zero register. SAVE_XMM/RESTORE_XMM come from
    ; x86_abi_support.asm; presumably they preserve it where the ABI requires.
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight
    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 16                     ; loop count (one row per pass)
    pxor        xmm6, xmm6                  ; zero, for byte -> word unpacking
 .combine:
    movdqa      xmm2, [rax]                 ; 16 src bytes (aligned load)
    movdqa      xmm4, [rdx]                 ; 16 dst bytes (aligned load)
    add         rax, rsi
    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6                  ; low 8 bytes  -> words
    punpckhbw   xmm3, xmm6                  ; high 8 bytes -> words
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0
    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1
    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4
    packuswb    xmm2, xmm3                  ; words -> bytes, unsigned saturate
    movdqa      [rdx], xmm2
    add         rdx, rdi
    dec         rcx
    jnz         .combine
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp9_filter_by_weight8x8_sse2
 ;(
 ;    const uint8_t *src,
 ;    int            src_stride,
 ;    uint8_t       *dst,
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
 ;
 ; Blends an 8x8 block of src into dst; the result is written back to dst.
 ; For every byte:
 ;     dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
 ; where 16 is tMFQE (1 << MFQE_PRECISION) and 8 is tMFQE_round.
 ;
 ; Rows are accessed 8 bytes at a time with movq.
 ; Only the low 16 bits of src_weight are used, and all arithmetic is done in
 ; 16-bit lanes; assumes 0 <= src_weight <= 16 -- TODO confirm against callers.
 global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
 sym(vp9_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight
    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride
    mov         rcx, 8                      ; loop count (one row per pass)
    pxor        xmm4, xmm4                  ; zero, for byte -> word unpacking
 .combine:
    movq        xmm2, [rax]                 ; 8 src bytes
    movq        xmm3, [rdx]                 ; 8 dst bytes
    add         rax, rsi
    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0
    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1
    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    packuswb    xmm2, xmm4                  ; words -> bytes; result in low 8
    movq        [rdx], xmm2
    add         rdx, rdi
    dec         rcx
    jnz         .combine
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp9_variance_and_sad_16x16_sse2 | arg
 ;(
 ;    unsigned char *src1,          0
 ;    int            stride1,       1
 ;    unsigned char *src2,          2
 ;    int            stride2,       3
 ;    unsigned int  *variance,      4
 ;    unsigned int  *sad,           5
 ;)
 ;
 ; Over one 16x16 block, computes and stores:
 ;     *sad      = (SAD(src1, src2) + 128) >> 8
 ;     *variance = (SUM(src2^2) - (SUM(src2)^2 >> 8) + 128) >> 8
 ; Note that the variance is taken over src2 only.
 global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
 sym(vp9_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2
    mov         rsi,        16              ; block height
    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2
    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so the rows are
    ; read with unaligned loads (movdqa would fault on an unaligned row).
 .accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2
    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0
    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2
    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0                  ; squares, pairwise summed to dwords
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1
    sub         rsi,        1
    jnz         .accumulate
    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8                     ; bring the high-half sum down
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]        ; round
    psrld       xmm0, 8
    mov         rax,  arg(5)
    movd        [rax], xmm0
    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8                     ; SUM(src2)^2 >> 8
    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. zero-extend the four dword partial sums to
    ; quadwords, accumulate, shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; low dword = SUM(src2^2)
    psubd       xmm1, xmm0
    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)
    movd        [rax], xmm1
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 SECTION_RODATA
 ; 16-byte constant used as the rounding term added before the ">> 8" when
 ; the SAD and variance results are finalized above. The three variants
 ; below differ only in how the assembler / byte order spells the value 128.
 align 16
 t128:
 %ifndef __NASM_VER__
    ddq 128
 %elif CONFIG_BIG_ENDIAN
    dq  0, 128
 %else
    dq  128, 0
 %endif
 ; Eight words of 16: the total weight shared between src and dst in the
 ; filter_by_weight routines (dst_weight = tMFQE - src_weight).
 align 16
 tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
 ; Eight words of 8: rounding term added before the ">> 4" in those routines.
 align 16
 tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -84,6 +84,7 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif