/* * Copyright (c) 2016 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "./av1_rtcd.h" #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm1d_sse4.h" static INLINE void int16_array_with_stride_to_int32_array_without_stride( const int16_t *input, int stride, int32_t *output, int txfm1d_size) { int r, c; for (r = 0; r < txfm1d_size; r++) { for (c = 0; c < txfm1d_size; c++) { output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; } } } typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output, const int8_t *cos_bit, const int8_t *stage_range); static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break; case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break; default: assert(0); } return NULL; } static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, int32_t *txfm_buf) { const int txfm_size = cfg->txfm_size; const int8_t *shift = cfg->shift; const int8_t *stage_range_col = cfg->stage_range_col; const int8_t *stage_range_row = cfg->stage_range_row; const int8_t *cos_bit_col = cfg->cos_bit_col; const int8_t *cos_bit_row = cfg->cos_bit_row; const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); __m128i *buf_128 = (__m128i *)txfm_buf; __m128i *out_128 = (__m128i *)output; int num_per_128 = 4; int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, txfm_size); round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); transpose_32(txfm_size, out_128, buf_128); txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); transpose_32(txfm_size, buf_128, out_128); } void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd) { int32_t txfm_buf[1024]; TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32); (void)bd; fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); } void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, int tx_type, int bd) { int32_t txfm_buf[4096]; TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type); (void)bd; fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); }