3a8c43a479
This change is made in preparation for a subsequent patch which adds acceleration for the highbitdepth transform functions. The highbitdepth transform functions attempt to use 16/32-bit SSE instructions where possible, but fall back to using the C implementations if potential overflow is detected. For this reason the dct routines are made global so they can be called from the acceleration functions in the subsequent patch. Change-Id: Ia921f191bf6936ccba4f13e8461624b120c1f665 (cherry picked from commit 454342d4e77dbb67f4a3c10f97a57a6fcb46d9a0)
192 lines
7.9 KiB
C
192 lines
7.9 KiB
C
/*
|
|
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#ifndef VP9_COMMON_VP9_IDCT_H_
|
|
#define VP9_COMMON_VP9_IDCT_H_
|
|
|
|
#include <assert.h>
|
|
|
|
#include "./vpx_config.h"
|
|
#include "vp9/common/vp9_common.h"
|
|
#include "vp9/common/vp9_enums.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
// Constants and macros used by all idct/dct functions.

// Number of fractional bits in the fixed-point trig constants declared in
// this header (they are scaled by 1 << DCT_CONST_BITS == 16384).
#define DCT_CONST_BITS 14
// Half of 1 << DCT_CONST_BITS, i.e. the bias added before shifting right by
// DCT_CONST_BITS to round to nearest.
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

// UNIT_QUANT_FACTOR is 1 << UNIT_QUANT_SHIFT, i.e. 4.
#define UNIT_QUANT_SHIFT 2
#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
|
|
|
|
// Builds a vector of eight 16-bit lanes holding the pair (a, b) repeated four
// times. _mm_set_epi16 takes its arguments from the highest lane down to
// lane 0, so a lands in the even lanes (0, 2, 4, 6) and b in the odd lanes.
// Both arguments are truncated to int16_t and each is expanded four times, so
// callers should pass expressions without side effects.
#define pair_set_epi16(a, b)                  \
  _mm_set_epi16((int16_t)(b), (int16_t)(a),   \
                (int16_t)(b), (int16_t)(a),   \
                (int16_t)(b), (int16_t)(a),   \
                (int16_t)(b), (int16_t)(a))
|
|
|
|
// Builds a vector of eight 16-bit lanes split in two halves.
// _mm_set_epi16 takes its arguments from the highest lane down to lane 0, so
// a fills the four low lanes (0..3) and b fills the four high lanes (4..7).
// Both arguments are truncated to int16_t and each is expanded four times, so
// callers should pass expressions without side effects.
#define dual_set_epi16(a, b)                                              \
  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b),   \
                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
|
|
|
|
// Constants:
|
|
// for (int i = 1; i< 32; ++i)
|
|
// printf("static const int cospi_%d_64 = %.0f;\n", i,
|
|
// round(16384 * cos(i*M_PI/64)));
|
|
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
|
|
static const tran_high_t cospi_1_64 = 16364;
|
|
static const tran_high_t cospi_2_64 = 16305;
|
|
static const tran_high_t cospi_3_64 = 16207;
|
|
static const tran_high_t cospi_4_64 = 16069;
|
|
static const tran_high_t cospi_5_64 = 15893;
|
|
static const tran_high_t cospi_6_64 = 15679;
|
|
static const tran_high_t cospi_7_64 = 15426;
|
|
static const tran_high_t cospi_8_64 = 15137;
|
|
static const tran_high_t cospi_9_64 = 14811;
|
|
static const tran_high_t cospi_10_64 = 14449;
|
|
static const tran_high_t cospi_11_64 = 14053;
|
|
static const tran_high_t cospi_12_64 = 13623;
|
|
static const tran_high_t cospi_13_64 = 13160;
|
|
static const tran_high_t cospi_14_64 = 12665;
|
|
static const tran_high_t cospi_15_64 = 12140;
|
|
static const tran_high_t cospi_16_64 = 11585;
|
|
static const tran_high_t cospi_17_64 = 11003;
|
|
static const tran_high_t cospi_18_64 = 10394;
|
|
static const tran_high_t cospi_19_64 = 9760;
|
|
static const tran_high_t cospi_20_64 = 9102;
|
|
static const tran_high_t cospi_21_64 = 8423;
|
|
static const tran_high_t cospi_22_64 = 7723;
|
|
static const tran_high_t cospi_23_64 = 7005;
|
|
static const tran_high_t cospi_24_64 = 6270;
|
|
static const tran_high_t cospi_25_64 = 5520;
|
|
static const tran_high_t cospi_26_64 = 4756;
|
|
static const tran_high_t cospi_27_64 = 3981;
|
|
static const tran_high_t cospi_28_64 = 3196;
|
|
static const tran_high_t cospi_29_64 = 2404;
|
|
static const tran_high_t cospi_30_64 = 1606;
|
|
static const tran_high_t cospi_31_64 = 804;
|
|
|
|
// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
|
|
static const tran_high_t sinpi_1_9 = 5283;
|
|
static const tran_high_t sinpi_2_9 = 9929;
|
|
static const tran_high_t sinpi_3_9 = 13377;
|
|
static const tran_high_t sinpi_4_9 = 15212;
|
|
|
|
// Narrows an intermediate transform value from tran_high_t to tran_low_t.
//
// For valid VP9 streams, intermediate stage coefficients stay within:
//   -  8 bit: signed 16 bit integer
//   - 10 bit: signed 18 bit integer
//   - 12 bit: signed 20 bit integer
// Coefficients can leave these ranges for invalid/corrupt streams. Strictly
// checking every intermediate coefficient can be burdensome for a decoder, so
// the signed 16 bit assertions below are compiled in only when the build is
// not highbitdepth and was configured with
// --enable-coefficient-range-checking. Highbitdepth builds perform no range
// check here.
static INLINE tran_low_t check_range(tran_high_t input) {
#if !CONFIG_VP9_HIGHBITDEPTH && CONFIG_COEFFICIENT_RANGE_CHECKING
  assert(INT16_MIN <= input);
  assert(input <= INT16_MAX);
#endif  // !CONFIG_VP9_HIGHBITDEPTH && CONFIG_COEFFICIENT_RANGE_CHECKING
  return (tran_low_t)input;
}
|
|
|
|
// Removes the DCT_CONST_BITS fractional bits from a fixed-point value via
// ROUND_POWER_OF_TWO, then narrows the result through check_range().
static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
  const tran_high_t rounded = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  return check_range(rounded);
}
|
|
|
|
// Pointer to a 1-D transform: reads coefficients through the first (const)
// pointer and writes results through the second.
typedef void (*transform_1d)(const tran_low_t *input, tran_low_t *output);
|
|
|
|
typedef struct {
|
|
transform_1d cols, rows; // vertical and horizontal
|
|
} transform_2d;
|
|
|
|
#if CONFIG_VP9_HIGHBITDEPTH
// Highbitdepth counterpart of the 1-D transform pointer; it takes an extra
// bd argument (presumably the bit depth: 8, 10 or 12 -- confirm with callers).
typedef void (*highbd_transform_1d)(const tran_low_t *input,
                                    tran_low_t *output, int bd);

// A 2-D highbitdepth transform expressed as a pair of 1-D transforms.
typedef struct {
  highbd_transform_1d cols;  // vertical
  highbd_transform_1d rows;  // horizontal
} highbd_transform_2d;
#endif  // CONFIG_VP9_HIGHBITDEPTH
|
|
|
|
#if CONFIG_EMULATE_HARDWARE
|
|
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations:
// the value is reduced to its low (8 + bd) bits and sign-extended.
//   bd of 8  uses tran_low_t with 16 bits, need to remove 16 bits
//   bd of 10 uses tran_low_t with 18 bits, need to remove 14 bits
//   bd of 12 uses tran_low_t with 20 bits, need to remove 12 bits
//   bd of x  uses tran_low_t with 8+x bits, need to remove 24-x bits
//
// The left shift is done on uint32_t because left-shifting a negative signed
// value is undefined behaviour in C; the arithmetic right shift on int32_t
// then restores the sign. bd is parenthesized so that any expression may be
// passed for it. Both x and bd must be free of side effects (bd is expanded
// twice).
#define WRAPLOW(x, bd) \
  ((int32_t)((uint32_t)(x) << (24 - (bd))) >> (24 - (bd)))
|
|
#else
|
|
// Without hardware emulation no wrapping is applied: x is passed through
// unchanged and bd is ignored (it is not evaluated).
#define WRAPLOW(x, bd) (x)
|
|
#endif // CONFIG_EMULATE_HARDWARE
|
|
|
|
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
|
|
int eob);
|
|
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
|
|
int eob);
|
|
void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
|
|
int eob);
|
|
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int
|
|
eob);
|
|
void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
|
|
int eob);
|
|
|
|
void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
|
|
int stride, int eob);
|
|
void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
|
|
int stride, int eob);
|
|
void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
|
|
int stride, int eob);
|
|
|
|
#if CONFIG_VP9_HIGHBITDEPTH
|
|
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
|
|
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
|
|
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
|
|
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
|
|
int eob, int bd);
|
|
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
|
|
int eob, int bd);
|
|
void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
|
|
int eob, int bd);
|
|
void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
|
|
int stride, int eob, int bd);
|
|
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
|
|
int stride, int eob, int bd);
|
|
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
|
|
uint8_t *dest, int stride, int eob, int bd);
|
|
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
|
|
uint8_t *dest, int stride, int eob, int bd);
|
|
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
|
|
uint8_t *dest, int stride, int eob, int bd);
|
|
// Adds a transform result to a destination pixel value.
// trans is first passed through WRAPLOW, added to dest, the sum is passed
// through WRAPLOW again, and the result is handed to clip_pixel_highbd()
// together with bd.
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
  const tran_high_t wrapped_trans = WRAPLOW(trans, bd);
  return clip_pixel_highbd(WRAPLOW(dest + wrapped_trans, bd), bd);
}
|
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
#endif
|
|
|
|
#endif // VP9_COMMON_VP9_IDCT_H_
|