Merge "Support rectangular tx_size in the common lib" into nextgenv2

2016-11-02 21:49:41 +00:00
parent e714e70f77 9fe31390ca
commit c104b8f269
3 changed files with 29 additions and 33 deletions
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -389,25 +389,23 @@ if (aom_config("CONFIG_TX64X64") eq "yes") {
  specialize qw/av1_fht64x64/;
 }

-if (aom_config("CONFIG_EXT_TX") eq "yes") {
-  add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht4x8 sse2/;
+add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht4x8 sse2/;

-  add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht8x4 sse2/;
+add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x4 sse2/;

-  add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht8x16 sse2/;
+add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x16 sse2/;

-  add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x8 sse2/;
+add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x8 sse2/;

-  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x32 sse2/;
+add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x32 sse2/;

-  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht32x16 sse2/;
-}
+add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x16 sse2/;

 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
  if (aom_config("CONFIG_EXT_TX") ne "yes") {
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -55,7 +55,6 @@ static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
  }
 }

-#if CONFIG_EXT_TX
 static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
                         int diff_stride, TX_TYPE tx_type,
                         FWD_TXFM_OPT fwd_txfm_opt) {
@@ -97,7 +96,6 @@ static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
  (void)fwd_txfm_opt;
  av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
 }
-#endif  // CONFIG_EXT_TX

 static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                         int diff_stride, TX_TYPE tx_type,
@@ -233,7 +231,6 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
  }
 }

-#if CONFIG_EXT_TX
 static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TX_TYPE tx_type,
                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
@@ -281,7 +278,6 @@ static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
  (void)bd;
  av1_highbd_fht32x16(src_diff, coeff, diff_stride, tx_type);
 }
-#endif  // CONFIG_EXT_TX

 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TX_TYPE tx_type,
@@ -403,7 +399,6 @@ void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
    case TX_8X8:
      fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
      break;
-#if CONFIG_EXT_TX
    case TX_4X8:
      fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
      break;
@@ -422,7 +417,6 @@ void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
    case TX_32X16:
      fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
      break;
-#endif  // CONFIG_EXT_TX
    case TX_4X4:
      fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
      break;
@@ -452,7 +446,6 @@ void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
      highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
                          bd);
      break;
-#if CONFIG_EXT_TX
    case TX_4X8:
      highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
                          bd);
@@ -477,7 +470,6 @@ void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
      highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
                            bd);
      break;
-#endif  // CONFIG_EXT_TX
    case TX_4X4:
      highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd);
      break;
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -2592,7 +2592,6 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
  }
 }

-#if CONFIG_EXT_TX
 static INLINE void scale_sqrt2_8x4(__m128i *in) {
  // Implements fdct_round_shift(input * Sqrt2), which is equivalent to
  // ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS),
@@ -2767,9 +2766,9 @@ void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[8];

+  load_buffer_4x8(input, in, stride, 0, 0);
  switch (tx_type) {
    case DCT_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
      fdct8_sse2(in);
      // Repack data into two 4x4 blocks so we can reuse the 4x4 transforms
      // The other cases (and the 8x4 transforms) all behave similarly
@@ -2781,7 +2780,6 @@ void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      fdct4_sse2(in + 4);
      break;
    case ADST_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
      fadst8_sse2(in);
      in[4] = _mm_shuffle_epi32(in[0], 0xe);
      in[5] = _mm_shuffle_epi32(in[1], 0xe);
@@ -2791,7 +2789,6 @@ void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      fdct4_sse2(in + 4);
      break;
    case DCT_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
      fdct8_sse2(in);
      in[4] = _mm_shuffle_epi32(in[0], 0xe);
      in[5] = _mm_shuffle_epi32(in[1], 0xe);
@@ -2801,7 +2798,6 @@ void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst4_sse2(in + 4);
      break;
    case ADST_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
      fadst8_sse2(in);
      in[4] = _mm_shuffle_epi32(in[0], 0xe);
      in[5] = _mm_shuffle_epi32(in[1], 0xe);
@@ -2810,6 +2806,7 @@ void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      break;
+#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_4x8(input, in, stride, 1, 0);
      fadst8_sse2(in);
@@ -2930,6 +2927,7 @@ void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst4_sse2(in);
      fadst4_sse2(in + 4);
      break;
+#endif
    default: assert(0); break;
  }
  write_buffer_4x8(output, in);
@@ -3023,6 +3021,7 @@ void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst4_sse2(in + 4);
      fadst8_sse2(in);
      break;
+#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_8x4(input, in, stride, 1, 0);
      fadst4_sse2(in);
@@ -3095,6 +3094,7 @@ void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
      fidtx4_sse2(in + 4);
      fadst8_sse2(in);
      break;
+#endif
    default: assert(0); break;
  }
  write_buffer_8x4(output, in);
@@ -3158,6 +3158,7 @@ void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
+#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
@@ -3254,6 +3255,7 @@ void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst8_sse2(t);
      fadst8_sse2(b);
      break;
+#endif
    default: assert(0); break;
  }
  right_shift_8x8(t, 2);
@@ -3314,6 +3316,7 @@ void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst8_sse2(r);
      fadst16_8col(in);
      break;
+#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(l);
@@ -3386,6 +3389,7 @@ void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
      fidtx8_sse2(r);
      fadst16_8col(in);
      break;
+#endif
    default: assert(0); break;
  }
  array_transpose_8x8(l, l);
@@ -3436,6 +3440,7 @@ static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
  fdct16_sse2(tl, tr);
 }

+#if CONFIG_EXT_TX
 static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                 __m128i *br) {
  int i;
@@ -3448,6 +3453,7 @@ static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
  array_transpose_16x16(tl, tr);
  array_transpose_16x16(bl, br);
 }
+#endif

 static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
                                     __m128i *intr, __m128i *inbl,
@@ -3507,7 +3513,7 @@ static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
  }
 }

-// Note on data layout, for both this and the 32x16 tranforms:
+// Note on data layout, for both this and the 32x16 transforms:
 // So that we can reuse the 16-element transforms easily,
 // we want to split the input into 8x16 blocks.
 // For 16x32, this means the input is a 2x2 grid of such blocks.
@@ -3541,6 +3547,7 @@ void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst16_sse2(intl, intr);
      fadst16_sse2(inbl, inbr);
      break;
+#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
      fhalfright32_16col(intl, intr, inbl, inbr);
@@ -3613,6 +3620,7 @@ void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
      fadst16_sse2(intl, intr);
      fadst16_sse2(inbl, inbr);
      break;
+#endif
    default: assert(0); break;
  }
  write_buffer_16x32(output, intl, intr, inbl, inbr);
@@ -3671,31 +3679,29 @@ void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i in0[16], in1[16], in2[16], in3[16];

+  load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
  switch (tx_type) {
    case DCT_DCT:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fdct16_sse2(in0, in1);
      fdct16_sse2(in2, in3);
      fdct32_16col(in0, in1, in2, in3);
      break;
    case ADST_DCT:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      fdct32_16col(in0, in1, in2, in3);
      break;
    case DCT_ADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fdct16_sse2(in0, in1);
      fdct16_sse2(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3);
      break;
    case ADST_ADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
      fadst16_sse2(in0, in1);
      fadst16_sse2(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3);
      break;
+#if CONFIG_EXT_TX
    case FLIPADST_DCT:
      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
      fadst16_sse2(in0, in1);
@@ -3768,8 +3774,8 @@ void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
      fidtx16_sse2(in2, in3);
      fhalfright32_16col(in0, in1, in2, in3);
      break;
+#endif
    default: assert(0); break;
  }
  write_buffer_32x16(output, in0, in1, in2, in3);
 }
-#endif  // CONFIG_EXT_TX