Merge "Fix avx2 16x16/32x32 fwd txfm coeff output on HBD" into nextgenv2
This commit is contained in:
		@@ -205,6 +205,7 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
 | 
			
		||||
ifeq ($(ARCH_X86_64),yes)
 | 
			
		||||
DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
 | 
			
		||||
endif
 | 
			
		||||
DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.h
 | 
			
		||||
DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 | 
			
		||||
DSP_SRCS-$(HAVE_AVX2)   += x86/txfm_common_avx2.h
 | 
			
		||||
DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,14 @@
 | 
			
		||||
#undef FDCT32x32_2D_AVX2
 | 
			
		||||
#undef FDCT32x32_HIGH_PRECISION
 | 
			
		||||
 | 
			
		||||
// TODO(luoyi): The following macro hides an error. The second parameter type of
 | 
			
		||||
// function,
 | 
			
		||||
//   void FDCT32x32_2D_AVX2(const int16_t *, int16_t*, int);
 | 
			
		||||
// is different from the one in,
 | 
			
		||||
//   void aom_fdct32x32_avx2(const int16_t *, tran_low_t*, int);
 | 
			
		||||
// In CONFIG_AOM_HIGHBITDEPTH=1 build, the second parameter type should be
 | 
			
		||||
// int32_t.
 | 
			
		||||
// This function should be removed after the av1_fht32x32 scaling/rounding fix.
 | 
			
		||||
#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
 | 
			
		||||
#define FDCT32x32_HIGH_PRECISION 1
 | 
			
		||||
#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										35
									
								
								aom_dsp/x86/fwd_txfm_avx2.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								aom_dsp/x86/fwd_txfm_avx2.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,35 @@
 | 
			
		||||
/*
 | 
			
		||||
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 | 
			
		||||
 *
 | 
			
		||||
 * This source code is subject to the terms of the BSD 2 Clause License and
 | 
			
		||||
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 | 
			
		||||
 * was not distributed with this source code in the LICENSE file, you can
 | 
			
		||||
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 | 
			
		||||
 * Media Patent License 1.0 was not distributed with this source code in the
 | 
			
		||||
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
 | 
			
		||||
#define AOM_DSP_X86_FWD_TXFM_AVX2_H
 | 
			
		||||
 | 
			
		||||
#include "./aom_config.h"
 | 
			
		||||
 | 
			
		||||
// Stores the sixteen 16-bit coefficients held in *coeff to out using
// unaligned stores.
//
// With CONFIG_AOM_HIGHBITDEPTH enabled, every coefficient is sign-extended to
// 32 bits first, so two 256-bit stores are issued: coefficients 0..7 at out
// and coefficients 8..15 at out + 8. Otherwise the register is written
// unchanged with a single 256-bit store.
static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
#if CONFIG_AOM_HIGHBITDEPTH
  // Split the register into its two 128-bit halves (eight int16 each).
  const __m128i lower_half = _mm256_castsi256_si128(*coeff);
  const __m128i upper_half = _mm256_extracti128_si256(*coeff, 1);

  // Widen each half to eight sign-extended 32-bit values, in element order.
  const __m256i widened_lo = _mm256_cvtepi16_epi32(lower_half);
  const __m256i widened_hi = _mm256_cvtepi16_epi32(upper_half);

  _mm256_storeu_si256((__m256i *)out, widened_lo);
  _mm256_storeu_si256((__m256i *)(out + 8), widened_hi);
#else
  _mm256_storeu_si256((__m256i *)out, *coeff);
#endif
}
 | 
			
		||||
 | 
			
		||||
#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
 | 
			
		||||
@@ -14,6 +14,7 @@
 | 
			
		||||
#include "./av1_rtcd.h"
 | 
			
		||||
#include "./aom_dsp_rtcd.h"
 | 
			
		||||
 | 
			
		||||
#include "aom_dsp/x86/fwd_txfm_avx2.h"
 | 
			
		||||
#include "aom_dsp/txfm_common.h"
 | 
			
		||||
#include "aom_dsp/x86/txfm_common_avx2.h"
 | 
			
		||||
 | 
			
		||||
@@ -273,24 +274,11 @@ static INLINE void load_buffer_16x16(const int16_t *input, int stride,
 | 
			
		||||
  in[15] = _mm256_slli_epi16(in[15], 2);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static INLINE void write_buffer_16x16(const __m256i *in, int stride,
 | 
			
		||||
                                      tran_low_t *output) {
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)output, in[0]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + stride), in[1]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 2 * stride), in[2]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 3 * stride), in[3]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 4 * stride), in[4]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 5 * stride), in[5]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 6 * stride), in[6]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 7 * stride), in[7]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 8 * stride), in[8]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 9 * stride), in[9]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 10 * stride), in[10]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 11 * stride), in[11]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 12 * stride), in[12]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 13 * stride), in[13]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 14 * stride), in[14]);
 | 
			
		||||
  _mm256_storeu_si256((__m256i *)(output + 15 * stride), in[15]);
 | 
			
		||||
// Writes the sixteen registers in[0]..in[15] to output through
// storeu_output_avx2(), one register per row; row r starts at
// output + 16 * r.
static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) {
  tran_low_t *dst = output;
  int row = 0;
  while (row < 16) {
    storeu_output_avx2(in + row, dst);
    dst += 16;
    ++row;
  }
}
 | 
			
		||||
 | 
			
		||||
static void right_shift_16x16(__m256i *in) {
 | 
			
		||||
@@ -1253,7 +1241,7 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
 | 
			
		||||
    default: assert(0); break;
 | 
			
		||||
  }
 | 
			
		||||
  mm256_transpose_16x16(in);
 | 
			
		||||
  write_buffer_16x16(in, 16, output);
 | 
			
		||||
  write_buffer_16x16(in, output);
 | 
			
		||||
  _mm256_zeroupper();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -1623,12 +1611,13 @@ static void fdct32_avx2(__m256i *in0, __m256i *in1) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
 | 
			
		||||
                                      int stride, tran_low_t *output) {
 | 
			
		||||
                                      tran_low_t *output) {
 | 
			
		||||
  int i = 0;
 | 
			
		||||
  const int stride = 32;
 | 
			
		||||
  tran_low_t *coeff = output;
 | 
			
		||||
  while (i < 32) {
 | 
			
		||||
    _mm256_storeu_si256((__m256i *)coeff, in0[i]);
 | 
			
		||||
    _mm256_storeu_si256((__m256i *)(coeff + 16), in1[i]);
 | 
			
		||||
    storeu_output_avx2(&in0[i], coeff);
 | 
			
		||||
    storeu_output_avx2(&in1[i], coeff + 16);
 | 
			
		||||
    coeff += stride;
 | 
			
		||||
    i += 1;
 | 
			
		||||
  }
 | 
			
		||||
@@ -1885,6 +1874,6 @@ void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
 | 
			
		||||
    default: assert(0); break;
 | 
			
		||||
  }
 | 
			
		||||
  nr_right_shift_32x32(in0, in1);
 | 
			
		||||
  write_buffer_32x32(in0, in1, 32, output);
 | 
			
		||||
  write_buffer_32x32(in0, in1, output);
 | 
			
		||||
  _mm256_zeroupper();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -90,8 +90,14 @@ class AV1Trans32x32HT : public libaom_test::TransformTestBase,
 | 
			
		||||
  IhtFunc inv_txfm_;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// TODO(luoyi): Owing to the range check in DCT_DCT of av1_fht32x32_avx2, as
 | 
			
		||||
// input is out of the range, we use aom_fdct32x32_avx2. However this function
 | 
			
		||||
// does not support CONFIG_AOM_HIGHBITDEPTH. I need to fix the scaling/rounding
 | 
			
		||||
// of av1_fht32x32_avx2, then enable this test for CONFIG_AOM_HIGHBITDEPTH.
 | 
			
		||||
#if !CONFIG_AOM_HIGHBITDEPTH
 | 
			
		||||
TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
 | 
			
		||||
TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if CONFIG_AOM_HIGHBITDEPTH
 | 
			
		||||
class AV1HighbdTrans32x32HT
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user