Merge changes I267050a5,Iebade0ef,Id96a8df3

* changes:
  quantize_fp_32x32 highbd ssse3: enable existing function
  quantize_fp highbd ssse3: use tran_low_t for coeff
  quantize_fp highbd sse2: use tran_low_t for coeff
2017-02-16 20:34:48 +00:00 · 2017-02-16 20:34:48 +00:00 · cc43012674
commit cc43012674
parent 0bf6b51572 ff37a911ce
4 changed files with 86 additions and 57 deletions
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@ -136,9 +136,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp9_block_error_fp sse2/;

  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_fp neon/;
+  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";

  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";

  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
  specialize qw/vp9_fdct8x8_quant neon ssse3/;
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@ -13,14 +13,16 @@

 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"

-void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
-                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan_ptr,
-                          const int16_t *iscan_ptr) {
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  __m128i zero;
  __m128i thr;
  int16_t nzflag;
@ -53,8 +55,8 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
-        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
+        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
@ -77,15 +79,15 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
@ -120,8 +122,8 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

-        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
+        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
@ -146,20 +148,20 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
        } else {
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+          store_zero_tran_low(qcoeff_ptr + n_coeffs);
+          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
        }
      }

@ -199,10 +201,11 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
    }
  } else {
    do {
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+      store_zero_tran_low(qcoeff_ptr + n_coeffs);
+      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@ -11,6 +11,7 @@
 %define private_prefix vp9

 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION_RODATA
 pw_1: times 8 dw 1
@ -48,15 +49,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endif
  pxor                            m5, m5                   ; m5 = dedicated zero

-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                            r5q, [  r5q+ncoeffq*2]
-  lea                            r3q, [ r3q+ncoeffq*2]
-  lea                            r4q, [r4q+ncoeffq*2]
+  INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq
+  lea                            r5q, [r5q+ncoeffq*2]
+  INCREMENT_ELEMENTS_TRAN_LOW    r3q, ncoeffq
+  INCREMENT_ELEMENTS_TRAN_LOW    r4q, ncoeffq
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]
+  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpeqw                         m7, m7
@ -69,8 +70,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
-  mova            [r3q+ncoeffq*2+ 0], m8
-  mova            [r3q+ncoeffq*2+16], m13
+  STORE_TRAN_LOW  8, r3q, ncoeffq,     6, 11, 12
+  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
 %ifidn %1, fp_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
@ -87,8 +88,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %else
  psrlw                           m0, m3, 1
 %endif
-  mova            [r4q+ncoeffq*2+ 0], m8
-  mova            [r4q+ncoeffq*2+16], m13
+  STORE_TRAN_LOW  8, r4q, ncoeffq,     6, 11, 12
+  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
@ -102,8 +103,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  jz .accumulate_eob

 .ac_only_loop:
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]
+  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)

@ -123,8 +124,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
-  mova            [r3q+ncoeffq*2+ 0], m14
-  mova            [r3q+ncoeffq*2+16], m13
+  STORE_TRAN_LOW 14, r3q, ncoeffq,     6, 11, 12
+  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
 %ifidn %1, fp_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
@ -137,8 +138,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  psignw                         m14, m9
  psignw                         m13, m10
 %endif
-  mova            [r4q+ncoeffq*2+ 0], m14
-  mova            [r4q+ncoeffq*2+16], m13
+  STORE_TRAN_LOW 14, r4q, ncoeffq,     6, 11, 12
+  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
@ -154,10 +155,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

  jmp .accumulate_eob
 .skip_iter:
-  mova            [r3q+ncoeffq*2+ 0], m5
-  mova            [r3q+ncoeffq*2+16], m5
-  mova            [r4q+ncoeffq*2+ 0], m5
-  mova            [r4q+ncoeffq*2+16], m5
+  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq
+  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8
+  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq
+  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8
  add                        ncoeffq, mmsize
  jl .ac_only_loop

@ -186,10 +187,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  neg                        ncoeffq
  pxor                            m7, m7
 .blank_loop:
-  mova            [r0q+ncoeffq*2+ 0], m7
-  mova            [r0q+ncoeffq*2+16], m7
-  mova            [r2q+ncoeffq*2+ 0], m7
-  mova            [r2q+ncoeffq*2+16], m7
+  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq
+  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8
+  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq
+  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                     word [r3q], 0
--- a/vpx_dsp/x86/bitdepth_conversion_sse2.asm
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@ -38,29 +38,53 @@
 ; the values down to 16 bits.
 %macro LOAD_TRAN_LOW 3
 %if CONFIG_VP9_HIGHBITDEPTH
-  mova     m%1, [%2 + %3 * 4]
-  packssdw m%1, [%2 + %3 * 4 + 16]
+  mova     m%1, [%2 + (%3) * 4]
+  packssdw m%1, [%2 + (%3) * 4 + 16]
 %else
-  mova     m%1, [%2 + %3 * 2]
+  mova     m%1, [%2 + (%3) * 2]
 %endif
 %endmacro

 ; Store m%1 to %2 + %3.
 ; %3 is the offset in elements, not bytes.
+; If 5 arguments are provided then m%1 is corrupted.
+; If 6 arguments are provided then m%1 is preserved.
 ; If tran_low_t is 16 bits (low bit depth configuration) then store the value
 ; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
 ; extend the values first.
 ; Uses m%4-m%6 as scratch registers for high bit depth.
-%macro STORE_TRAN_LOW 5
+%macro STORE_TRAN_LOW 5-6
 %if CONFIG_VP9_HIGHBITDEPTH
  pxor                      m%4, m%4
  mova                      m%5, m%1
+  %if %0 == 6
+  mova                      m%6, m%1
+  %endif
  pcmpgtw                   m%4, m%1
  punpcklwd                 m%5, m%4
+  %if %0 == 5
  punpckhwd                 m%1, m%4
-  mova       [%2 + %3 * 4 +  0], m%5
-  mova       [%2 + %3 * 4 + 16], m%1
+  %else
+  punpckhwd                 m%6, m%4
+  %endif
+  mova     [%2 + (%3) * 4 +  0], m%5
+  %if %0 == 5
+  mova     [%2 + (%3) * 4 + 16], m%1
+  %else
+  mova     [%2 + (%3) * 4 + 16], m%6
+  %endif
 %else
-  mova            [%2 + %3 * 2], m%1
+  mova          [%2 + (%3) * 2], m%1
+%endif
+%endmacro
+
+; Store zeros (in m%1) to %2 + %3.
+; %3 is the offset in elements, not bytes.
+%macro STORE_ZERO_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     [%2 + (%3) * 4 +  0], m%1
+  mova     [%2 + (%3) * 4 + 16], m%1
+%else
+  mova          [%2 + (%3) * 2], m%1
 %endif
 %endmacro