Remove asm_offsets dependency in quantize_b_ssse3

Replace it with intrinsic code and inline assembly.

Change-Id: I81b4df146db3d01039059be7dae31083e2943b97
Johann 2013-04-03 14:49:30 -07:00 committed by Johann
parent 0f1a3461d6
commit 570d43c020
3 changed files with 111 additions and 139 deletions
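
For context, the asm_offsets mechanism being removed: standalone assembly cannot use offsetof() on C structs, so build-time-generated constants such as vp8_block_coeff were added to raw pointers by hand in the .asm file. Intrinsics sidestep this because the compiler sees the struct definition. A minimal sketch of the difference (hypothetical helper name; BLOCK comes from vp8/encoder/block.h):

#include <tmmintrin.h>           /* SSSE3 intrinsics */
#include "vp8/encoder/block.h"   /* BLOCK, with its short *coeff member */

/* Hypothetical illustration: the deleted .asm needed a generated constant,
 *   mov rax, [rdi + vp8_block_coeff]   ; rdi = BLOCK *b
 * while C code lets the compiler resolve the member offset itself. */
static __m128i load_first_coeffs(const BLOCK *b) {
  return _mm_load_si128((const __m128i *)b->coeff);
}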

vp8/encoder/x86/quantize_ssse3.asm (deleted)

@@ -1,138 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"

; void vp8_fast_quantize_b_ssse3 | arg
;  (BLOCK  *b,                   |  0
;   BLOCKD *d)                   |  1

global sym(vp8_fast_quantize_b_ssse3) PRIVATE
sym(vp8_fast_quantize_b_ssse3):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov        rdi, rdi                    ; BLOCK *b
    ;mov        rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_round]
    mov         rdx, [rdi + vp8_block_quant_fast]

    ; coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; round
    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15
    psraw       xmm0, 15
    psraw       xmm4, 15

    pabsw       xmm1, xmm1
    pabsw       xmm5, xmm5

    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; quant_fast
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    mov         rax, [rsi + vp8_blockd_qcoeff]
    mov         rdi, [rsi + vp8_blockd_dequant]
    mov         rcx, [rsi + vp8_blockd_dqcoeff]

    movdqa      xmm2, xmm1                  ; store y for getting eob
    movdqa      xmm3, xmm5

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    movdqa      xmm0, [rdi]
    movdqa      xmm4, [rdi + 16]

    pmullw      xmm0, xmm1
    pmullw      xmm4, xmm5

    pxor        xmm1, xmm1
    pcmpgtw     xmm2, xmm1                  ; calculate eob
    pcmpgtw     xmm3, xmm1
    packsswb    xmm2, xmm3
    pshufb      xmm2, [GLOBAL(zz_shuf)]
    pmovmskb    edx, xmm2

    movdqa      [rcx], xmm0                 ; store dqcoeff
    movdqa      [rcx + 16], xmm4            ; store dqcoeff

    mov         rcx, [rsi + vp8_blockd_eob]
    bsr         eax, edx                    ; index of last set bit
    add         eax, 1                      ; eob is 1-based
    cmp         edx, 0                      ; if no bits set, eob = 0
    cmove       eax, edx
    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret
SECTION_RODATA
align 16
; zig-zag scan order (vp8_default_zig_zag1d), used to reorder the
; per-coefficient nonzero flags into scan order before pmovmskb
zz_shuf:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15

vp8/encoder/x86/quantize_ssse3.c (new)

@@ -0,0 +1,110 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <tmmintrin.h> /* SSSE3 */

#include "vp8/encoder/block.h"
#include "vpx/vpx_integer.h" /* uint8_t */
#include "vpx_ports/mem.h"   /* DECLARE_ALIGNED */

/* bit scan reverse (bsr): returns the 1-based position of the highest set
 * bit, or 0 if no bits are set. */
#if defined(_MSC_VER)
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
static int bsr(int mask) {
  unsigned long eob; /* _BitScanReverse() writes an unsigned long index */
  _BitScanReverse(&eob, (unsigned long)mask);
  eob++;
  if (mask == 0)
    eob = 0;
  return (int)eob;
}
#else
static int bsr(int mask) {
  int eob;
  __asm__ __volatile__("bsr %1, %0" : "=r" (eob) : "r" (mask) : "cc");
  eob++;
  if (mask == 0)
    eob = 0;
  return eob;
}
#endif
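
/* Aside (hypothetical, not part of this change): on GCC/Clang the same
 * "1-based index of the highest set bit, 0 for an empty mask" result could
 * be written without inline assembly as
 *   return mask ? 32 - __builtin_clz((unsigned int)mask) : 0;
 */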
void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
  int eob, mask;

  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

  __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;

  DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) =
      { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
  __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);

  /* sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z) */
  x0 = _mm_abs_epi16(z0);
  x1 = _mm_abs_epi16(z1);

  /* x += round */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  /* y = (x * quant) >> 16 */
  y0 = _mm_mulhi_epi16(x0, quant_fast0);
  y1 = _mm_mulhi_epi16(x1, quant_fast1);

  /* The assembly saved the still-unsigned y values at this point for the
   * eob calculation; keep copies here for the same reason, since the signed
   * compare against zero below would miss negative coefficients. */
  abs0 = y0;
  abs1 = y1;

  /* Restore the sign bit. */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  x0 = _mm_sub_epi16(y0, sz0);
  x1 = _mm_sub_epi16(y1, sz1);

  /* qcoeff = x */
  _mm_store_si128((__m128i *)(d->qcoeff), x0);
  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

  /* x * dequant */
  x0 = _mm_mullo_epi16(x0, dequant0);
  x1 = _mm_mullo_epi16(x1, dequant1);

  /* dqcoeff = x * dequant */
  _mm_store_si128((__m128i *)(d->dqcoeff), x0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1);

  /* Build a per-coefficient nonzero mask, reorder it into zig-zag scan
   * order, and take the 1-based position of the highest set bit as eob. */
  zeros = _mm_setzero_si128();

  x0 = _mm_cmpgt_epi16(abs0, zeros);
  x1 = _mm_cmpgt_epi16(abs1, zeros);

  x = _mm_packs_epi16(x0, x1);

  x = _mm_shuffle_epi8(x, zig_zag);

  mask = _mm_movemask_epi8(x);

  eob = bsr(mask);

  *d->eob = 0xFF & eob;
}
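
To make the eob logic at the end of the function concrete, here is a standalone scalar model of the same computation (illustrative only, not libvpx API; helper names are hypothetical): build a 16-bit nonzero mask in zig-zag scan order, then eob is the 1-based position of the highest set bit.

#include <stdio.h>

/* Same scan order as the pshufb mask above. */
static const int zig_zag[16] = { 0, 1, 4, 8, 5, 2, 3, 6,
                                 9, 12, 13, 10, 7, 11, 14, 15 };

static int model_eob(const short qcoeff[16]) {
  int mask = 0, i;
  for (i = 0; i < 16; i++) {
    if (qcoeff[zig_zag[i]] != 0)
      mask |= 1 << i;          /* bit i = nonzero at scan position i */
  }
  if (mask == 0)
    return 0;                  /* all-zero block: eob = 0 */
  i = 15;                      /* bsr: find the highest set bit */
  while (!(mask & (1 << i)))
    i--;
  return i + 1;                /* eob counts coefficients, so 1-based */
}

int main(void) {
  short q[16] = { 0 };
  q[0] = 5;                    /* DC coefficient */
  q[4] = -2;                   /* raster index 4 = zig-zag position 2 */
  printf("eob = %d\n", model_eob(q)); /* prints "eob = 3" */
  return 0;
}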

vp8/vp8cx.mk

@@ -88,6 +88,7 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -96,7 +97,6 @@ endif
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm