Use intrinsics for sse2 regular quantize

Remove this function's dependency on asm_offsets; ssse3/sse4 are next.
Change the quant_shift calculation so it can be done using SIMD.
Pre-calculate as much as possible to simplify the EOB selection.
Take advantage of qcoeff being zeroed by tying the if statements together.

Speed parity with the previous implementation with gcc on x86_64 linux.

Change-Id: Ife97556a1eca3a74b09def1a3d04084974dff1fb
commit eca59cad0b (parent ef887974aa)
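The key arithmetic change: the per-coefficient variable shift `y >> quant_shift[rc]` becomes a multiply by a precomputed factor `1 << (16 - shift)` followed by a constant shift of 16, which maps directly onto SSE2's `pmulhw` (`_mm_mulhi_epi16`). A minimal scalar sketch of the equivalence (not from the commit; the `quant` and `shift` values are illustrative):

```c
#include <assert.h>

/* Old form: variable right shift per coefficient. */
static short quantize_shift(short x, short quant, unsigned char shift)
{
    return (short)((((x * quant) >> 16) + x) >> shift);
}

/* New form: multiply by 1 << (16 - shift) and keep the high 16 bits,
 * which is exactly the per-lane operation pmulhw performs. */
static short quantize_mulhi(short x, short quant, short shift_factor)
{
    return (short)(((((x * quant) >> 16) + x) * shift_factor) >> 16);
}

int main(void)
{
    const unsigned char shift = 2;                /* sample shift amount */
    const short factor = (short)(1 << (16 - shift));
    const short quant = 12000;                    /* sample quantizer value */
    short x;

    for (x = 0; x < 8192; x++)                    /* x is |z| + round, so >= 0 */
        assert(quantize_shift(x, quant, shift) ==
               quantize_mulhi(x, quant, factor));
    return 0;
}
```

Because the multiplier is data instead of a shift count, all sixteen coefficients can be scaled with two `pmulhw` instructions instead of sixteen scalar shifts.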
vp8/common/rtcd_defs.sh
@@ -444,8 +444,9 @@ vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
 # Quantizer
 #
 prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
-specialize vp8_regular_quantize_b sse2 sse4_1
-vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+specialize vp8_regular_quantize_b sse2 #sse4_1
+# TODO(johann) Update sse4 implementation and re-enable
+#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
 
 prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
 specialize vp8_fast_quantize_b sse2 ssse3 media neon

vp8/encoder/block.h
@@ -37,7 +37,7 @@ typedef struct block
     /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
     short *quant;
     short *quant_fast;
-    unsigned char *quant_shift;
+    short *quant_shift;
     short *zbin;
     short *zrun_zbin_boost;
     short *round;

vp8/encoder/onyx_int.h
@@ -282,17 +282,17 @@ typedef struct VP8_COMP
 {
 
     DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
 

vp8/encoder/quantize.c
@@ -50,8 +50,8 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];                /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;          /* quantize (x) */
             x  = (y ^ sz) - sz;                         /* get the sign back */
             qcoeff_ptr[rc] = x;                         /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];      /* dequantized value */

@@ -113,7 +113,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
     short *zbin_ptr       = b->zbin;
     short *round_ptr      = b->round;
     short *quant_ptr      = b->quant;
-    unsigned char *quant_shift_ptr = b->quant_shift;
+    short *quant_shift_ptr = b->quant_shift;
     short *qcoeff_ptr     = d->qcoeff;
     short *dqcoeff_ptr    = d->dqcoeff;
     short *dequant_ptr    = d->dequant;

@@ -138,8 +138,8 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];                /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;          /* quantize (x) */
             x  = (y ^ sz) - sz;                         /* get the sign back */
             qcoeff_ptr[rc] = x;                         /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];      /* dequantized value */

@@ -167,7 +167,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
     int sz;
     short *coeff_ptr;
     short *quant_ptr;
-    unsigned char *quant_shift_ptr;
+    short *quant_shift_ptr;
     short *qcoeff_ptr;
     short *dqcoeff_ptr;
     short *dequant_ptr;

@@ -198,7 +198,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= dq)
         {
             /* Quantize x. */
-            y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+            y  = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;
             /* Put the sign back. */
             x  = (y + sz) ^ sz;
             /* Save the coefficient and its dequantized value. */

@@ -406,7 +406,7 @@ static const int qzbin_factors_y2[129] =
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
 static void invert_quant(int improved_quant, short *quant,
-                         unsigned char *shift, short d)
+                         short *shift, short d)
 {
     if(improved_quant)
     {

@@ -418,11 +418,15 @@ static void invert_quant(int improved_quant, short *quant,
         t = 1 + (1<<(16+l))/d;
         *quant = (short)(t - (1<<16));
         *shift = l;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
     else
     {
         *quant = (1 << 16) / d;
         *shift = 0;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
 }
 
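With this change `invert_quant` stores not the shift amount `l` but the multiplier `1 << (16 - l)`, so the SIMD path can apply it with a single high-half multiply. A quick scalar check of that encoding (a sketch, not from the commit; it assumes `l` stays in a range where the multiplier is a positive signed short — `l = 1` would produce 0x8000, which overflows):

```c
#include <assert.h>

/* What pmulhw computes per 16-bit lane: the high half of the product. */
static short mulhi(short a, short b)
{
    return (short)((a * b) >> 16);
}

int main(void)
{
    int l;
    for (l = 2; l <= 14; l++)
    {
        const short m = (short)(1 << (16 - l));   /* what invert_quant stores */
        short y;
        for (y = 0; y < 16384; y++)
            assert(mulhi(y, m) == (short)(y >> l));
    }
    return 0;
}
```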

vp8/encoder/x86/quantize_sse2.asm (deleted)
@@ -1,245 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse2) PRIVATE
-sym(vp8_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                   ; BLOCK *b
-    ;mov         rsi, rsi                   ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp8_block_coeff]       ; coeff_ptr
-    mov         rcx, [rdi + vp8_block_zbin]        ; zbin_ptr
-    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp8_block_round]       ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp8_block_quant]       ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp8_block_quant_shift]     ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ; qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost]        ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp8_blockd_dequant]  ; dequant_ptr
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]  ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff]   ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2                 ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0                 ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
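For reference, the EOB selection the deleted assembly performs — mask the `inv_zig_zag` positions by the nonzero lanes, then take a horizontal maximum — translates to intrinsics roughly as follows. This is an illustrative sketch, not code from the commit; the helper name and argument packing are hypothetical:

```c
#include <emmintrin.h> /* SSE2 */

/* qcoeff0/qcoeff1: the 16 quantized coefficients;
 * izz0/izz1: the inv_zig_zag table (1-based positions).
 * Returns the position of the last nonzero coefficient in
 * zig-zag order, or 0 if all coefficients are zero. */
static int select_eob_sse2(__m128i qcoeff0, __m128i qcoeff1,
                           __m128i izz0, __m128i izz1)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi16(zero, zero);          /* all 1s */
    /* zero lanes -> 0xffff, then invert so nonzero lanes -> 0xffff */
    __m128i nz0 = _mm_xor_si128(_mm_cmpeq_epi16(qcoeff0, zero), ones);
    __m128i nz1 = _mm_xor_si128(_mm_cmpeq_epi16(qcoeff1, zero), ones);
    /* keep the inverse zig-zag position of each nonzero lane */
    __m128i pos = _mm_max_epi16(_mm_and_si128(nz0, izz0),
                                _mm_and_si128(nz1, izz1));
    /* horizontal max across the 8 words (pshufd/pshuflw + pmaxsw) */
    pos = _mm_max_epi16(pos, _mm_shuffle_epi32(pos, 0x0e));
    pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, 0x0e));
    pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, 0x01));
    return _mm_cvtsi128_si32(pos) & 0xff;
}
```

The replacement C code below drops this reduction: it tracks `eob` directly in the scalar selection loop, which the commit message notes was pre-simplified by doing the arithmetic up front.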

vp8/encoder/x86/quantize_sse2.c
@@ -9,13 +9,140 @@
  */
 
 
-#include "vp8/common/blockd.h"
-#include "vp8/common/entropy.h"
 #include "vpx_config.h"
+#include "vp8_rtcd.h"
 #include "vpx_ports/x86.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
 
-#include <mmintrin.h> //MMX
-#include <xmmintrin.h> //SSE
-#include <emmintrin.h> //SSE2
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
+#define SELECT_EOB(i, z) \
+    do { \
+        __label__ select_eob_end; \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            goto select_eob_end; \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+        select_eob_end:; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0;
+    short *zbin_boost_ptr = b->zrun_zbin_boost;
+    short *qcoeff_ptr = d->qcoeff;
+    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
+    DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+    vpx_memset(qcoeff_ptr, 0, 32);
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost;
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
+
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}
+
 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
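SELECT_EOB folds the two C-side conditions (`x < zbin_boost` and `y == 0`) into one branch, which is safe only because `qcoeff_ptr` was zeroed up front: a skipped coefficient simply keeps its zero. The `__label__` declaration uses GCC's local-labels extension so the macro can expand sixteen times in one function without duplicate-label errors. A minimal standalone illustration of that pattern (hypothetical names; GCC/Clang only):

```c
#include <stdio.h>

/* Each expansion gets its own private `skip` label. */
#define PRINT_IF_POSITIVE(v)      \
    do {                          \
        __label__ skip;           \
        if ((v) <= 0)             \
            goto skip;            \
        printf("%d\n", (v));      \
        skip:;                    \
    } while (0)

int main(void)
{
    PRINT_IF_POSITIVE(3);   /* prints 3 */
    PRINT_IF_POSITIVE(-1);  /* skipped  */
    PRINT_IF_POSITIVE(7);   /* prints 7 */
    return 0;
}
```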

vp8/vp8cx.mk
@@ -90,7 +90,6 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 
 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)