vp8 fast quantizer with intrinsics

Reduce dependency on the offsets file by using intrinsics. Disassembly
shows improvements over the previous assembly, specifically in register
management, preloading, and the {pro,epi}log. The speed change is within
the margin of error.

Change-Id: I8131b4b4d62bc092407fe847bfaa8f2c0e1384ff
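For reference, the removed assembly and the new intrinsics below implement the same per-coefficient math; a minimal scalar sketch of one lane follows, with illustrative names and signature that are not part of the commit:

    /* Scalar sketch of the fast-quantize math the SSE2 code vectorizes.
       Names and signature are illustrative only. */
    static short fast_quantize_one(short z, short round, short quant)
    {
        short sz = z >> 15;                     /* sign mask: 0 or -1 */
        short x = (short)((z ^ sz) - sz);       /* abs(z) */
        short y;
        x += round;                             /* x += round */
        y = (short)(((int)x * quant) >> 16);    /* high half of product, like pmulhw */
        return (short)((y ^ sz) - sz);          /* reapply the sign -> qcoeff */
    }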
@@ -236,147 +236,6 @@ ZIGZAG_LOOP 15
    pop         rbp
    ret

;void vp8_fast_quantize_b_sse2 | arg
;  (BLOCK  *b,                 |  0
;   BLOCKD *d)                 |  1

global sym(vp8_fast_quantize_b_sse2) PRIVATE
sym(vp8_fast_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %else
    ; these registers are used for passing arguments
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov        rdi, rdi                    ; BLOCK *b
    ;mov        rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_round]
    mov         rdx, [rdi + vp8_block_quant_fast]

    ; z = coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; dup z so we can save sz
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; x += round
    paddw       xmm1, [rcx]
    paddw       xmm5, [rcx + 16]

    mov         rax, [rsi + vp8_blockd_qcoeff]
    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

    ; y = x * quant >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    ; x = (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; qcoeff = x
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    ; x * dequant
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm5
    pmullw      xmm2, [rcx]
    pmullw      xmm3, [rcx + 16]

    ; dqcoeff = x * dequant
    movdqa      [rdi], xmm2
    movdqa      [rdi + 16], xmm3

    pxor        xmm4, xmm4                  ; clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ; set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

    pand        xmm1, [GLOBAL(inv_zig_zag)]
    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]

    pmaxsw      xmm1, xmm5

    mov         rcx, [rsi + vp8_blockd_eob]

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    movd        eax, xmm1
    and         eax, 0xff

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret

SECTION_RODATA
align 16
inv_zig_zag:
vp8/encoder/x86/quantize_sse2.c  (new file, 103 lines)
@@ -0,0 +1,103 @@
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vp8/common/blockd.h"
#include "vp8/common/entropy.h"
#include "vp8/encoder/block.h"

#include <mmintrin.h>  //MMX
#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2

void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
    __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
    __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
    __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

    __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

    /* sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* x += round */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    /* y = (x * quant) >> 16 */
    y0 = _mm_mulhi_epi16(x0, quant_fast0);
    y1 = _mm_mulhi_epi16(x1, quant_fast1);

    /* x = (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    x0 = _mm_sub_epi16(y0, sz0);
    x1 = _mm_sub_epi16(y1, sz1);

    /* qcoeff = x */
    _mm_store_si128((__m128i *)(d->qcoeff), x0);
    _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

    /* x * dequant */
    xdq0 = _mm_mullo_epi16(x0, dequant0);
    xdq1 = _mm_mullo_epi16(x1, dequant1);

    /* dqcoeff = x * dequant */
    _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

    /* build a mask for the zig zag */
    zeros = _mm_setzero_si128();

    x0 = _mm_cmpeq_epi16(x0, zeros);
    x1 = _mm_cmpeq_epi16(x1, zeros);

    ones = _mm_cmpeq_epi16(zeros, zeros);

    x0 = _mm_xor_si128(x0, ones);
    x1 = _mm_xor_si128(x1, ones);

    x0 = _mm_and_si128(x0, inv_zig_zag0);
    x1 = _mm_and_si128(x1, inv_zig_zag1);

    x0 = _mm_max_epi16(x0, x1);

    /* now down to 8 */
    x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110

    x0 = _mm_max_epi16(x0, x1);

    /* only 4 left */
    x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110

    x0 = _mm_max_epi16(x0, x1);

    /* okay, just 2! */
    x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001

    x0 = _mm_max_epi16(x0, x1);

    *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}
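The tail of the function computes the end-of-block marker: every nonzero qcoeff lane is masked with its (1-based) inverse zig-zag position, and a horizontal max over the 16 lanes yields eob. A scalar sketch of that reduction, with illustrative names and assuming a 16-entry inverse zig-zag table:

    /* Scalar equivalent of the eob reduction above (illustrative only). */
    static char eob_from_qcoeff(const short *qcoeff, const short *inv_zig_zag)
    {
        short eob = 0;
        int i;
        for (i = 0; i < 16; i++)
            if (qcoeff[i] != 0 && inv_zig_zag[i] > eob)
                eob = inv_zig_zag[i];
        return (char)eob;
    }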
@@ -89,8 +89,15 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm

# TODO(johann) make this generic
ifeq ($(HAVE_SSE2),yes)
vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
endif

ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
ifeq ($(HAVE_SSE2),yes)
@@ -112,7 +119,6 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
endif

VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))

$(eval $(call asm_offsets_template,\
|
Reference in New Issue
Block a user