Add SSE2 versions of av1_fht8x16 and av1_fht16x8
Encoder speedup ~2% with ext-tx + rect-tx.

Change-Id: Id56ddf102a887de31d181bde6d8ef8c4f03da945
This commit is contained in:
		
				
					committed by
					
						
						Debargha Mukherjee
					
				
			
			
				
	
			
			
			
						parent
						
							e51ee021dc
						
					
				
				
					commit
					1a800f6539
				
			@@ -68,13 +68,13 @@ static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
 | 
			
		||||
  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
 | 
			
		||||
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
 | 
			
		||||
  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
 | 
			
		||||
  return _mm_srli_epi32(v_tmp_d, bits);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
 | 
			
		||||
  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
 | 
			
		||||
  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
 | 
			
		||||
  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
 | 
			
		||||
  const __m128i v_tmp_d =
 | 
			
		||||
      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
 | 
			
		||||
 
 | 
			
		||||
@@ -51,7 +51,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# dct
 | 
			
		||||
# Inverse dct
 | 
			
		||||
#
 | 
			
		||||
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
  # Note as optimized versions of these functions are added we need to add a check to ensure
 | 
			
		||||
@@ -368,10 +368,22 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") {
 | 
			
		||||
 | 
			
		||||
# fdct functions
 | 
			
		||||
 | 
			
		||||
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht4x4 sse2/;
 | 
			
		||||
add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
specialize qw/av1_fht4x4 sse2/;
 | 
			
		||||
 | 
			
		||||
add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
specialize qw/av1_fwht4x4/;
 | 
			
		||||
 | 
			
		||||
add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
specialize qw/av1_fht8x8 sse2/;
 | 
			
		||||
 | 
			
		||||
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
specialize qw/av1_fht16x16 sse2/;
 | 
			
		||||
 | 
			
		||||
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
specialize qw/av1_fht32x32/;
 | 
			
		||||
 | 
			
		||||
if (aom_config("CONFIG_EXT_TX") eq "yes") {
 | 
			
		||||
  add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht4x8/;
 | 
			
		||||
 | 
			
		||||
@@ -379,29 +391,19 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
  specialize qw/av1_fht8x4/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht8x16/;
 | 
			
		||||
  specialize qw/av1_fht8x16 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht16x8/;
 | 
			
		||||
  specialize qw/av1_fht16x8 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht16x32/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht32x16/;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht8x8 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht16x16 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht32x32/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fwht4x4/;
 | 
			
		||||
  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
 | 
			
		||||
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
 | 
			
		||||
  add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct4x4/;
 | 
			
		||||
 | 
			
		||||
@@ -428,7 +430,45 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct32x32_1/;
 | 
			
		||||
} else {
 | 
			
		||||
  add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct4x4 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct4x4_1 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct8x8 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct8x8_1 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct16x16 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct16x16_1 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct32x32 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct32x32_rd sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fdct32x32_1 sse2/;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
 | 
			
		||||
  if (aom_config("CONFIG_EXT_TX") ne "yes") {
 | 
			
		||||
    specialize qw/av1_fht4x4 msa/;
 | 
			
		||||
    specialize qw/av1_fht8x8 msa/;
 | 
			
		||||
    specialize qw/av1_fht16x16 msa/;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
 | 
			
		||||
    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_highbd_fdct4x4/;
 | 
			
		||||
 | 
			
		||||
@@ -453,33 +493,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_highbd_fdct32x32_1/;
 | 
			
		||||
  } else {
 | 
			
		||||
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct4x4 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct4x4_1 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct8x8 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct8x8_1 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct16x16 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct16x16_1 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32_rd sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32_1 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_highbd_fdct4x4 sse2/;
 | 
			
		||||
 | 
			
		||||
@@ -504,100 +517,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 | 
			
		||||
    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_highbd_fdct32x32_1/;
 | 
			
		||||
  }
 | 
			
		||||
} else {
 | 
			
		||||
  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht4x4 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht4x8/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht8x4/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht8x16/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht16x8/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht16x32/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht32x16/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht8x8 sse2/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht16x16 sse2/;
 | 
			
		||||
 | 
			
		||||
  if (aom_config("CONFIG_EXT_TX") ne "yes") {
 | 
			
		||||
    specialize qw/av1_fht4x4 msa/;
 | 
			
		||||
    specialize qw/av1_fht8x8 msa/;
 | 
			
		||||
    specialize qw/av1_fht16x16 msa/;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 | 
			
		||||
  specialize qw/av1_fht32x32/;
 | 
			
		||||
 | 
			
		||||
  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
  specialize qw/av1_fwht4x4/;
 | 
			
		||||
  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
 | 
			
		||||
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct4x4/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct4x4_1/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct8x8/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct8x8_1/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct16x16/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct16x16_1/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32_rd/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32_1/;
 | 
			
		||||
  } else {
 | 
			
		||||
    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct4x4 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct4x4_1 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct8x8 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct8x8_1 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct16x16 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct16x16_1 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32 sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32_rd sse2/;
 | 
			
		||||
 | 
			
		||||
    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 | 
			
		||||
    specialize qw/av1_fdct32x32_1 sse2/;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
 | 
			
		||||
 
 | 
			
		||||
@@ -1311,8 +1311,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
 | 
			
		||||
  // Columns
 | 
			
		||||
  for (i = 0; i < n; ++i) {
 | 
			
		||||
    for (j = 0; j < n2; ++j)
 | 
			
		||||
      temp_in[j] =
 | 
			
		||||
          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
 | 
			
		||||
      temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
 | 
			
		||||
                                             DCT_CONST_BITS);
 | 
			
		||||
    ht.cols(temp_in, temp_out);
 | 
			
		||||
    for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
 | 
			
		||||
  }
 | 
			
		||||
@@ -1321,7 +1321,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
 | 
			
		||||
  for (i = 0; i < n2; ++i) {
 | 
			
		||||
    for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
 | 
			
		||||
    ht.rows(temp_in, temp_out);
 | 
			
		||||
    for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2;
 | 
			
		||||
    for (j = 0; j < n; ++j)
 | 
			
		||||
      output[j + i * n] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
 | 
			
		||||
  }
 | 
			
		||||
  // Note: overall scale factor of transform is 8 times unitary
 | 
			
		||||
}
 | 
			
		||||
@@ -1358,8 +1359,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
 | 
			
		||||
  // Columns
 | 
			
		||||
  for (i = 0; i < n2; ++i) {
 | 
			
		||||
    for (j = 0; j < n; ++j)
 | 
			
		||||
      temp_in[j] =
 | 
			
		||||
          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
 | 
			
		||||
      temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
 | 
			
		||||
                                             DCT_CONST_BITS);
 | 
			
		||||
    ht.cols(temp_in, temp_out);
 | 
			
		||||
    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
 | 
			
		||||
  }
 | 
			
		||||
@@ -1368,7 +1369,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
 | 
			
		||||
  for (i = 0; i < n; ++i) {
 | 
			
		||||
    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
 | 
			
		||||
    ht.rows(temp_in, temp_out);
 | 
			
		||||
    for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2;
 | 
			
		||||
    for (j = 0; j < n2; ++j)
 | 
			
		||||
      output[j + i * n2] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
 | 
			
		||||
  }
 | 
			
		||||
  // Note: overall scale factor of transform is 8 times unitary
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -12,10 +12,11 @@
 | 
			
		||||
#include <assert.h>
 | 
			
		||||
#include <emmintrin.h>  // SSE2
 | 
			
		||||
 | 
			
		||||
#include "./av1_rtcd.h"
 | 
			
		||||
#include "./aom_dsp_rtcd.h"
 | 
			
		||||
#include "./av1_rtcd.h"
 | 
			
		||||
#include "aom_dsp/txfm_common.h"
 | 
			
		||||
#include "aom_dsp/x86/fwd_txfm_sse2.h"
 | 
			
		||||
#include "aom_dsp/x86/synonyms.h"
 | 
			
		||||
#include "aom_dsp/x86/txfm_common_sse2.h"
 | 
			
		||||
#include "aom_ports/mem.h"
 | 
			
		||||
 | 
			
		||||
@@ -2584,3 +2585,362 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
 | 
			
		||||
    default: assert(0); break;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if CONFIG_EXT_TX
 | 
			
		||||
static INLINE void scale_sqrt2_8x8(__m128i *in) {
 | 
			
		||||
  // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
 | 
			
		||||
  // for each element
 | 
			
		||||
  const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
 | 
			
		||||
 | 
			
		||||
  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
 | 
			
		||||
  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
 | 
			
		||||
  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
 | 
			
		||||
  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
 | 
			
		||||
  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
 | 
			
		||||
  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
 | 
			
		||||
  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
 | 
			
		||||
  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
 | 
			
		||||
  const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
 | 
			
		||||
  const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
 | 
			
		||||
  const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
 | 
			
		||||
  const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
 | 
			
		||||
  const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
 | 
			
		||||
  const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
 | 
			
		||||
  const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
 | 
			
		||||
  const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
 | 
			
		||||
 | 
			
		||||
  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
 | 
			
		||||
  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
 | 
			
		||||
  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
 | 
			
		||||
  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
 | 
			
		||||
  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
 | 
			
		||||
  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
 | 
			
		||||
  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
 | 
			
		||||
  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
 | 
			
		||||
  const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
 | 
			
		||||
  const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
 | 
			
		||||
  const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
 | 
			
		||||
  const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
 | 
			
		||||
  const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
 | 
			
		||||
  const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
 | 
			
		||||
  const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
 | 
			
		||||
  const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
 | 
			
		||||
 | 
			
		||||
  in[0] = _mm_packs_epi32(xx_roundn_epi32(v_p0a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p0b_d, DCT_CONST_BITS));
 | 
			
		||||
  in[1] = _mm_packs_epi32(xx_roundn_epi32(v_p1a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p1b_d, DCT_CONST_BITS));
 | 
			
		||||
  in[2] = _mm_packs_epi32(xx_roundn_epi32(v_p2a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p2b_d, DCT_CONST_BITS));
 | 
			
		||||
  in[3] = _mm_packs_epi32(xx_roundn_epi32(v_p3a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p3b_d, DCT_CONST_BITS));
 | 
			
		||||
  in[4] = _mm_packs_epi32(xx_roundn_epi32(v_p4a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p4b_d, DCT_CONST_BITS));
 | 
			
		||||
  in[5] = _mm_packs_epi32(xx_roundn_epi32(v_p5a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p5b_d, DCT_CONST_BITS));
 | 
			
		||||
  in[6] = _mm_packs_epi32(xx_roundn_epi32(v_p6a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p6b_d, DCT_CONST_BITS));
 | 
			
		||||
  in[7] = _mm_packs_epi32(xx_roundn_epi32(v_p7a_d, DCT_CONST_BITS),
 | 
			
		||||
                          xx_roundn_epi32(v_p7b_d, DCT_CONST_BITS));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Loads an 8-wide, 16-tall block of residuals as two 8x8 sub-blocks into
// in[0..7] and in[8..15], scaling each sub-block with scale_sqrt2_8x8().
//
// When 'flipud' is set the two sub-blocks trade places, so the lower half of
// the input is loaded first; the flag is also forwarded to load_buffer_8x8().
// NOTE(review): presumably load_buffer_8x8() reverses the rows inside each
// sub-block for 'flipud' and the columns for 'fliplr' - confirm against its
// definition.
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
  const int16_t *const upper = input;
  const int16_t *const lower = input + 8 * stride;

  // Pick the source of each output half up front instead of swapping later.
  const int16_t *const first = flipud ? lower : upper;
  const int16_t *const second = flipud ? upper : lower;

  load_buffer_8x8(first, in, stride, flipud, fliplr);
  scale_sqrt2_8x8(in);
  load_buffer_8x8(second, in + 8, stride, flipud, fliplr);
  scale_sqrt2_8x8(in + 8);
}
 | 
			
		||||
 | 
			
		||||
// SSE2 forward hybrid transform for an 8-wide, 16-tall block.
//
// For every supported 'tx_type' the sequence is the same:
//   1. load_buffer_8x16() loads (and optionally flips) the input and scales
//      it by Sqrt2;
//   2. a 16-point column pass runs over the whole buffer;
//   3. each 8x8 half is transposed in place;
//   4. an 8-point row pass runs on each half.
// Both halves then go through right_shift_8x8(..., 2) and are written to
// 'output' as two consecutive 8x8 blocks with a row pitch of 8.
//
// NOTE(review): the *_8col / *_sse2 helpers and right_shift_8x8() are defined
// outside this block; their exact rounding behaviour is assumed to match the
// C reference - confirm against their definitions.
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type) {
  __m128i in[16];

  __m128i *const top = in;      // Upper 8x8 half of the block
  __m128i *const bot = in + 8;  // Lower 8x8 half of the block

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fdct8_sse2(top);
      fdct8_sse2(bot);
      break;
    case ADST_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fdct8_sse2(top);
      fdct8_sse2(bot);
      break;
    case DCT_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
    case ADST_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
#if CONFIG_EXT_TX
    // Flipped variants: the flip is applied while loading.
    case FLIPADST_DCT:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fdct8_sse2(top);
      fdct8_sse2(bot);
      break;
    case DCT_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fdct16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 1);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
    case ADST_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
    case FLIPADST_ADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
    // Variants that use the fidtx helpers in one or both passes.
    case IDTX:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fidtx8_sse2(top);
      fidtx8_sse2(bot);
      break;
    case V_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fdct16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fidtx8_sse2(top);
      fidtx8_sse2(bot);
      break;
    case H_DCT:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fdct8_sse2(top);
      fdct8_sse2(bot);
      break;
    case V_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fidtx8_sse2(top);
      fidtx8_sse2(bot);
      break;
    case H_ADST:
      load_buffer_8x16(input, in, stride, 0, 0);
      fidtx16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
    case V_FLIPADST:
      load_buffer_8x16(input, in, stride, 1, 0);
      fadst16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fidtx8_sse2(top);
      fidtx8_sse2(bot);
      break;
    case H_FLIPADST:
      load_buffer_8x16(input, in, stride, 0, 1);
      fidtx16_8col(in);
      array_transpose_8x8(top, top);
      array_transpose_8x8(bot, bot);
      fadst8_sse2(top);
      fadst8_sse2(bot);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0); break;
  }

  // Final shift by 2 on each half, then store the halves back to back:
  // 64 coefficients (8 rows of 8) per half.
  right_shift_8x8(top, 2);
  right_shift_8x8(bot, 2);
  write_buffer_8x8(output, top, 8);
  write_buffer_8x8(output + 64, bot, 8);
}
 | 
			
		||||
 | 
			
		||||
// Loads a 16-wide, 8-tall block of residuals as two 8x8 sub-blocks into
// in[0..7] and in[8..15], scaling each sub-block with scale_sqrt2_8x8().
//
// When 'fliplr' is set the two sub-blocks trade places, so the right-hand 8
// columns of the input are loaded first; the flag is also forwarded to
// load_buffer_8x8().
// NOTE(review): presumably load_buffer_8x8() reverses the columns inside each
// sub-block for 'fliplr' and the rows for 'flipud' - confirm against its
// definition.
static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
  const int16_t *const left = input;
  const int16_t *const right = input + 8;

  // Pick the source of each output half up front instead of swapping later.
  const int16_t *const first = fliplr ? right : left;
  const int16_t *const second = fliplr ? left : right;

  // First 8 output columns, then the remaining 8.
  load_buffer_8x8(first, in, stride, flipud, fliplr);
  scale_sqrt2_8x8(in);
  load_buffer_8x8(second, in + 8, stride, flipud, fliplr);
  scale_sqrt2_8x8(in + 8);
}
 | 
			
		||||
 | 
			
		||||
// SSE2 forward hybrid transform for a block 16 samples wide and 8 rows tall.
//
//   input   - source samples; handed to load_buffer_16x8 together with stride
//   output  - destination for the coefficients; the two 8x8 halves are stored
//             through write_buffer_8x8 at output and output + 8, each with a
//             stride argument of 16
//   stride  - row pitch of input, forwarded unchanged to the loader
//   tx_type - picks the pair of 1-D kernels and the flipud / fliplr flags
//             passed to the loader
//
// Every supported tx_type runs the same call sequence: load the block, apply
// an 8-point kernel to each 8x8 half, then apply a 16-point kernel over the
// whole 16-register buffer.  The first switch below handles the load and the
// 8-point kernel; the second handles the 16-point kernel.
void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                      int tx_type) {
  __m128i in[16];

  // The left 8x8 sub block lives in in[0..7], the right one in in[8..15].
  __m128i *const left = in;
  __m128i *const right = in + 8;

  // Stage 1: load (with the flips this tx_type needs) and run the 8-point
  // kernel on both halves.
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
#if CONFIG_EXT_TX
    case V_DCT:
#endif  // CONFIG_EXT_TX
      load_buffer_16x8(input, in, stride, 0, 0);
      fdct8_sse2(left);
      fdct8_sse2(right);
      break;
    case ADST_DCT:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case V_ADST:
#endif  // CONFIG_EXT_TX
      load_buffer_16x8(input, in, stride, 0, 0);
      fadst8_sse2(left);
      fadst8_sse2(right);
      break;
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST:
      load_buffer_16x8(input, in, stride, 1, 0);
      fadst8_sse2(left);
      fadst8_sse2(right);
      break;
    case DCT_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fdct8_sse2(left);
      fdct8_sse2(right);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x8(input, in, stride, 1, 1);
      fadst8_sse2(left);
      fadst8_sse2(right);
      break;
    case ADST_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fadst8_sse2(left);
      fadst8_sse2(right);
      break;
    case IDTX:
    case H_DCT:
    case H_ADST:
      load_buffer_16x8(input, in, stride, 0, 0);
      fidtx8_sse2(left);
      fidtx8_sse2(right);
      break;
    case H_FLIPADST:
      load_buffer_16x8(input, in, stride, 0, 1);
      fidtx8_sse2(left);
      fidtx8_sse2(right);
      break;
#endif  // CONFIG_EXT_TX
    default: assert(0); break;
  }

  // Stage 2: run the 16-point kernel over the whole buffer.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
#if CONFIG_EXT_TX
    case FLIPADST_DCT:
    case H_DCT:
#endif  // CONFIG_EXT_TX
      fdct16_8col(in);
      break;
    case DCT_ADST:
    case ADST_ADST:
#if CONFIG_EXT_TX
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST:
#endif  // CONFIG_EXT_TX
      fadst16_8col(in);
      break;
#if CONFIG_EXT_TX
    case IDTX:
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      fidtx16_8col(in);
      break;
#endif  // CONFIG_EXT_TX
    // An unsupported tx_type has already hit the assert in stage 1.
    default: break;
  }

  // Transpose each half in place, apply the shift by 2, and store both halves.
  array_transpose_8x8(left, left);
  array_transpose_8x8(right, right);
  right_shift_8x8(left, 2);
  right_shift_8x8(right, 2);
  write_buffer_8x8(output, left, 16);
  write_buffer_8x8(output + 8, right, 16);
}
 | 
			
		||||
#endif  // CONFIG_EXT_TX
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										95
									
								
								test/av1_fht16x8_test.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										95
									
								
								test/av1_fht16x8_test.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,95 @@
 | 
			
		||||
/*
 | 
			
		||||
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 | 
			
		||||
 *
 | 
			
		||||
 *  Use of this source code is governed by a BSD-style license
 | 
			
		||||
 *  that can be found in the LICENSE file in the root of the source
 | 
			
		||||
 *  tree. An additional intellectual property rights grant can be found
 | 
			
		||||
 *  in the file PATENTS.  All contributing project authors may
 | 
			
		||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include "third_party/googletest/src/include/gtest/gtest.h"
 | 
			
		||||
 | 
			
		||||
#include "./aom_dsp_rtcd.h"
 | 
			
		||||
#include "./av1_rtcd.h"
 | 
			
		||||
 | 
			
		||||
#include "aom_ports/mem.h"
 | 
			
		||||
#include "test/acm_random.h"
 | 
			
		||||
#include "test/clear_system_state.h"
 | 
			
		||||
#include "test/register_state_check.h"
 | 
			
		||||
#include "test/transform_test_base.h"
 | 
			
		||||
#include "test/util.h"
 | 
			
		||||
 | 
			
		||||
using libaom_test::ACMRandom;
 | 
			
		||||
 | 
			
		||||
namespace {
 | 
			
		||||
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
 | 
			
		||||
                        int tx_type);
 | 
			
		||||
using std::tr1::tuple;
 | 
			
		||||
using libaom_test::FhtFunc;
 | 
			
		||||
typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x8Param;
 | 
			
		||||
 | 
			
		||||
void fht16x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
 | 
			
		||||
  av1_fht16x8_c(in, out, stride, tx_type);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
class AV1Trans16x8HT : public libaom_test::TransformTestBase,
 | 
			
		||||
                       public ::testing::TestWithParam<Ht16x8Param> {
 | 
			
		||||
 public:
 | 
			
		||||
  virtual ~AV1Trans16x8HT() {}
 | 
			
		||||
 | 
			
		||||
  virtual void SetUp() {
 | 
			
		||||
    fwd_txfm_ = GET_PARAM(0);
 | 
			
		||||
    inv_txfm_ = GET_PARAM(1);
 | 
			
		||||
    tx_type_ = GET_PARAM(2);
 | 
			
		||||
    pitch_ = 16;
 | 
			
		||||
    fwd_txfm_ref = fht16x8_ref;
 | 
			
		||||
    bit_depth_ = GET_PARAM(3);
 | 
			
		||||
    mask_ = (1 << bit_depth_) - 1;
 | 
			
		||||
    num_coeffs_ = GET_PARAM(4);
 | 
			
		||||
  }
 | 
			
		||||
  virtual void TearDown() { libaom_test::ClearSystemState(); }
 | 
			
		||||
 | 
			
		||||
 protected:
 | 
			
		||||
  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
 | 
			
		||||
    fwd_txfm_(in, out, stride, tx_type_);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
 | 
			
		||||
    inv_txfm_(out, dst, stride, tx_type_);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  FhtFunc fwd_txfm_;
 | 
			
		||||
  IhtFunc inv_txfm_;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); }
 | 
			
		||||
 | 
			
		||||
using std::tr1::make_tuple;
 | 
			
		||||
 | 
			
		||||
#if HAVE_SSE2
 | 
			
		||||
const Ht16x8Param kArrayHt16x8Param_sse2[] = {
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 0, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 1, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 2, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 3, AOM_BITS_8, 128),
 | 
			
		||||
#if CONFIG_EXT_TX
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 4, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 5, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 6, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 7, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 8, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 9, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 10, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 11, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 12, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 13, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 14, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 15, AOM_BITS_8, 128)
 | 
			
		||||
#endif  // CONFIG_EXT_TX
 | 
			
		||||
};
 | 
			
		||||
INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x8HT,
 | 
			
		||||
                        ::testing::ValuesIn(kArrayHt16x8Param_sse2));
 | 
			
		||||
#endif  // HAVE_SSE2
 | 
			
		||||
 | 
			
		||||
}  // namespace
 | 
			
		||||
							
								
								
									
										95
									
								
								test/av1_fht8x16_test.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										95
									
								
								test/av1_fht8x16_test.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,95 @@
 | 
			
		||||
/*
 | 
			
		||||
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 | 
			
		||||
 *
 | 
			
		||||
 *  Use of this source code is governed by a BSD-style license
 | 
			
		||||
 *  that can be found in the LICENSE file in the root of the source
 | 
			
		||||
 *  tree. An additional intellectual property rights grant can be found
 | 
			
		||||
 *  in the file PATENTS.  All contributing project authors may
 | 
			
		||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include "third_party/googletest/src/include/gtest/gtest.h"
 | 
			
		||||
 | 
			
		||||
#include "./aom_dsp_rtcd.h"
 | 
			
		||||
#include "./av1_rtcd.h"
 | 
			
		||||
 | 
			
		||||
#include "aom_ports/mem.h"
 | 
			
		||||
#include "test/acm_random.h"
 | 
			
		||||
#include "test/clear_system_state.h"
 | 
			
		||||
#include "test/register_state_check.h"
 | 
			
		||||
#include "test/transform_test_base.h"
 | 
			
		||||
#include "test/util.h"
 | 
			
		||||
 | 
			
		||||
using libaom_test::ACMRandom;
 | 
			
		||||
 | 
			
		||||
namespace {
 | 
			
		||||
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
 | 
			
		||||
                        int tx_type);
 | 
			
		||||
using std::tr1::tuple;
 | 
			
		||||
using libaom_test::FhtFunc;
 | 
			
		||||
typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x16Param;
 | 
			
		||||
 | 
			
		||||
void fht8x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
 | 
			
		||||
  av1_fht8x16_c(in, out, stride, tx_type);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
class AV1Trans8x16HT : public libaom_test::TransformTestBase,
 | 
			
		||||
                       public ::testing::TestWithParam<Ht8x16Param> {
 | 
			
		||||
 public:
 | 
			
		||||
  virtual ~AV1Trans8x16HT() {}
 | 
			
		||||
 | 
			
		||||
  virtual void SetUp() {
 | 
			
		||||
    fwd_txfm_ = GET_PARAM(0);
 | 
			
		||||
    inv_txfm_ = GET_PARAM(1);
 | 
			
		||||
    tx_type_ = GET_PARAM(2);
 | 
			
		||||
    pitch_ = 8;
 | 
			
		||||
    fwd_txfm_ref = fht8x16_ref;
 | 
			
		||||
    bit_depth_ = GET_PARAM(3);
 | 
			
		||||
    mask_ = (1 << bit_depth_) - 1;
 | 
			
		||||
    num_coeffs_ = GET_PARAM(4);
 | 
			
		||||
  }
 | 
			
		||||
  virtual void TearDown() { libaom_test::ClearSystemState(); }
 | 
			
		||||
 | 
			
		||||
 protected:
 | 
			
		||||
  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
 | 
			
		||||
    fwd_txfm_(in, out, stride, tx_type_);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
 | 
			
		||||
    inv_txfm_(out, dst, stride, tx_type_);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  FhtFunc fwd_txfm_;
 | 
			
		||||
  IhtFunc inv_txfm_;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); }
 | 
			
		||||
 | 
			
		||||
using std::tr1::make_tuple;
 | 
			
		||||
 | 
			
		||||
#if HAVE_SSE2
 | 
			
		||||
const Ht8x16Param kArrayHt8x16Param_sse2[] = {
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 0, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 1, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 2, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 3, AOM_BITS_8, 128),
 | 
			
		||||
#if CONFIG_EXT_TX
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 4, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 5, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 6, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 7, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 8, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 9, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 10, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 11, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 12, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 13, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 14, AOM_BITS_8, 128),
 | 
			
		||||
  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 15, AOM_BITS_8, 128)
 | 
			
		||||
#endif  // CONFIG_EXT_TX
 | 
			
		||||
};
 | 
			
		||||
INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x16HT,
 | 
			
		||||
                        ::testing::ValuesIn(kArrayHt8x16Param_sse2));
 | 
			
		||||
#endif  // HAVE_SSE2
 | 
			
		||||
 | 
			
		||||
}  // namespace
 | 
			
		||||
@@ -132,6 +132,10 @@ LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_dct_test.cc
 | 
			
		||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht4x4_test.cc
 | 
			
		||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x8_test.cc
 | 
			
		||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x16_test.cc
 | 
			
		||||
ifeq ($(CONFIG_EXT_TX),yes)
 | 
			
		||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x16_test.cc
 | 
			
		||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc
 | 
			
		||||
endif
 | 
			
		||||
LIBAOM_TEST_SRCS-$(CONFIG_ANS)          += av1_ans_test.cc
 | 
			
		||||
LIBAOM_TEST_SRCS-$(CONFIG_EXT_TILE)     += av1_ext_tile_test.cc
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -137,7 +137,7 @@ class TransformTestBase {
 | 
			
		||||
 | 
			
		||||
      // The minimum quant value is 4.
 | 
			
		||||
      for (int j = 0; j < num_coeffs_; ++j) {
 | 
			
		||||
        EXPECT_EQ(output_block[j], output_ref_block[j])
 | 
			
		||||
        ASSERT_EQ(output_block[j], output_ref_block[j])
 | 
			
		||||
            << "Error: not bit-exact result at index: " << j
 | 
			
		||||
            << " at test block: " << i;
 | 
			
		||||
      }
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user