Change eob threshold for partial inverse 8x8 2D-DCT to 12

In the scanning order, the first 12 coefficients of the 8x8 2D-DCT all sit in the top-left 4x4 block. Hence the partial inverse 8x8 2D-DCT can handle all cases with eob <= 12. The overall runtime of the inverse 8x8 2D-DCT unit is reduced from 166 cycles (using SSE2) to 150 cycles (using SSSE3).

Change-Id: I4514f9748042809ac84df4c14382c00f313f1cd2
2014-05-08 09:42:26 -07:00
parent 9e7b09bc5d
commit 41a350a83d
7 changed files with 18 additions and 18 deletions
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -132,8 +132,8 @@ INSTANTIATE_TEST_CASE_P(
                   &vp9_idct16x16_1_add_c,
                   TX_16X16, 1),
        make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_c,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_c,
+                   TX_8X8, 12),
        make_tuple(&vp9_idct8x8_64_add_c,
                   &vp9_idct8x8_1_add_c,
                   TX_8X8, 1),
@@ -154,8 +154,8 @@ INSTANTIATE_TEST_CASE_P(
                   &vp9_idct16x16_1_add_neon,
                   TX_16X16, 1),
        make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_neon,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_neon,
+                   TX_8X8, 12),
        make_tuple(&vp9_idct8x8_64_add_c,
                   &vp9_idct8x8_1_add_neon,
                   TX_8X8, 1),
@@ -181,8 +181,8 @@ INSTANTIATE_TEST_CASE_P(
                   &vp9_idct16x16_1_add_sse2,
                   TX_16X16, 1),
        make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_sse2,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_sse2,
+                   TX_8X8, 12),
        make_tuple(&vp9_idct8x8_64_add_c,
                   &vp9_idct8x8_1_add_sse2,
                   TX_8X8, 1),
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
@@ -9,7 +9,7 @@
 ;

    EXPORT  |vp9_idct8x8_64_add_neon|
-    EXPORT  |vp9_idct8x8_10_add_neon|
+    EXPORT  |vp9_idct8x8_12_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -310,13 +310,13 @@
    bx              lr
    ENDP  ; |vp9_idct8x8_64_add_neon|

-;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)

-|vp9_idct8x8_10_add_neon| PROC
+|vp9_idct8x8_12_add_neon| PROC
    push            {r4-r9}
    vpush           {d8-d15}
    vld1.s16        {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
    vpop            {d8-d15}
    pop             {r4-r9}
    bx              lr
-    ENDP  ; |vp9_idct8x8_10_add_neon|
+    ENDP  ; |vp9_idct8x8_12_add_neon|

    END
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
  }
 }

-void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
  }
 }

-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8] = { 0 };
  int16_t *outptr = out;
  int i, j;
@@ -1348,8 +1348,8 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob == 1)
    // DC only DCT coefficient
    vp9_idct8x8_1_add(input, dest, stride);
-  else if (eob <= 10)
-    vp9_idct8x8_10_add(input, dest, stride);
+  else if (eob <= 12)
+    vp9_idct8x8_12_add(input, dest, stride);
  else
    vp9_idct8x8_64_add(input, dest, stride);
 }
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -312,8 +312,8 @@ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
 add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
 specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";

-add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/, "$ssse3_x86_64";
+add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";

 add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
 specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -995,7 +995,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
  RECON_AND_STORE(dest, in[7]);
 }

-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<4);
--- a/vp9/common/x86/vp9_idct_ssse3.asm
+++ b/vp9/common/x86/vp9_idct_ssse3.asm
@@ -185,7 +185,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
  RET

-; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
-cglobal idct8x8_10_add, 3, 5, 13, input, output, stride
+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
  mova       m8, [pd_8192]
  mova      m11, [pw_16]
  mova      m12, [pw_11585x2]