Merge "Factor inverse transform functions into vpx_dsp"

This commit is contained in:
Jingning Han 2015-08-01 16:20:24 +00:00 committed by Gerrit Code Review
commit b4c7d0523a
32 changed files with 6928 additions and 6849 deletions

View File

@ -14,8 +14,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"

File diff suppressed because it is too large Load Diff

View File

@ -14,63 +14,16 @@
#include <assert.h>
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
#if CONFIG_VP9_HIGHBITDEPTH
#include "vpx_dsp/vpx_dsp_common.h"
#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
// Narrows an intermediate transform coefficient from tran_high_t to
// tran_low_t.
//
// When the build is configured with --enable-coefficient-range-checking
// (CONFIG_COEFFICIENT_RANGE_CHECKING), the value is also asserted to fit in a
// signed 16 bit integer. Valid VP9 streams keep every intermediate stage
// coefficient inside that range; only invalid/corrupt streams can leave it.
// The check is opt-in because verifying every intermediate coefficient can be
// burdensome for a decoder.
static INLINE tran_low_t check_range(tran_high_t input) {
  const tran_low_t narrowed = (tran_low_t)input;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
  assert(INT16_MIN <= input);
  assert(input <= INT16_MAX);
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
  return narrowed;
}
// Applies ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) to |input| and returns the
// result after passing it through check_range().
static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
  return check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS));
}
#if CONFIG_VP9_HIGHBITDEPTH
// High bit-depth counterpart of check_range(): narrows |input| to tran_low_t.
//
// With CONFIG_COEFFICIENT_RANGE_CHECKING enabled, the value is asserted to
// fit in a signed (8 + bd) bit integer, which is the range intermediate stage
// coefficients of valid high bit-depth VP9 streams stay within:
//   - bd ==  8: signed 16 bit integer
//   - bd == 10: signed 18 bit integer
//   - bd == 12: signed 20 bit integer
static INLINE tran_low_t highbd_check_range(tran_high_t input, int bd) {
  const tran_low_t narrowed = (tran_low_t)input;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
  const int32_t int_max = (1 << (7 + bd)) - 1;
  const int32_t int_min = -int_max - 1;
  assert(int_min <= input);
  assert(input <= int_max);
  // int_min is referenced only by the assert above.
  (void) int_min;
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
  // bd is otherwise unused when range checking is compiled out.
  (void) bd;
  return narrowed;
}
// Applies ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) to |input| and returns the
// result after passing it through highbd_check_range() for bit depth |bd|.
static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
                                                      int bd) {
  return highbd_check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
typedef struct {
@ -85,28 +38,6 @@ typedef struct {
} highbd_transform_2d;
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations:
// the value is truncated to (8 + bd) bits and sign-extended.
// bd of 8 uses tran_low_t with 16 bits, need to remove 16 bits
// bd of 10 uses tran_low_t with 18 bits, need to remove 14 bits
// bd of 12 uses tran_low_t with 20 bits, need to remove 12 bits
// bd of x uses tran_low_t with 8+x bits, need to remove 24-x bits
//
// The left shift is carried out on uint32_t because left-shifting a negative
// signed value is undefined behaviour in C. Both macro parameters are
// parenthesized so that expression arguments (e.g. WRAPLOW(a + b, n + 2))
// expand correctly.
#define WRAPLOW(x, bd) \
  ((int32_t)((uint32_t)(x) << (24 - (bd))) >> (24 - (bd)))
#else
// Without hardware emulation no wrapping is performed; bd is ignored.
#define WRAPLOW(x, bd) ((int32_t)(x))
#endif  // CONFIG_EMULATE_HARDWARE
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@ -126,9 +57,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@ -145,11 +73,6 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
// Adds the inverse-transform residual |trans| to the high bit-depth pixel
// |dest|. The residual and then the sum are each passed through
// WRAPLOW(..., bd); the wrapped sum is handed to clip_pixel_highbd() for bit
// depth |bd| and that result is returned.
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
  const tran_high_t residual = WRAPLOW(trans, bd);
  return clip_pixel_highbd(WRAPLOW(dest + residual, bd), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"

View File

@ -87,39 +87,6 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_1_add/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_16_add/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add/;
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add/;
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_256_add/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_10_add/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1024_add/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add/;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add/;
@ -128,51 +95,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add/;
# dct and add
add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_1_add/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_16_add/;
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_1_add/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_16_add/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add/;
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add/;
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_256_add/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_10_add/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1024_add/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add/;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add/;
@ -181,50 +106,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add/;
# dct and add
add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_1_add/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_16_add/;
} else {
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_16_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/;
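# Not a typo: a 34 eob idct32x32 neon implementation still needs to be added,
# so the neon_asm entry is aliased to the full 1024 version for now.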
$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
@ -233,14 +115,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
# dct and add
add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_1_add msa/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
}
}
@ -295,24 +169,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_1_add/;
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_1_add/;
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_1_add/;
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1024_add/;
add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_34_add/;
add_proto qw/void vp9_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1_add/;
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/vp9_highbd_iht4x4_16_add/;
@ -321,50 +177,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
specialize qw/vp9_highbd_iht16x16_256_add/;
# dct and add
add_proto qw/void vp9_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_1_add/;
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add/;
} else {
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add sse2/;
}
}
#

File diff suppressed because it is too large Load Diff

View File

@ -17,7 +17,7 @@
#include <tmmintrin.h> // SSSE3
#include "./vp9_rtcd.h"
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,

View File

@ -66,7 +66,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
@ -96,13 +95,6 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
endif
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
ifeq ($(ARCH_X86_64), yes)
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
endif
endif
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
@ -111,30 +103,4 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
endif
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
ifeq ($(HAVE_NEON_ASM), yes)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
else
ifeq ($(HAVE_NEON), yes)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))

View File

@ -10,7 +10,7 @@
#include <arm_neon.h>
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
void vp9_idct16x16_1_add_neon(

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,

View File

@ -12,7 +12,7 @@
#include "./vpx_config.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
static INLINE void LD_16x8(

View File

@ -10,7 +10,7 @@
#include <arm_neon.h>
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
void vp9_idct4x4_1_add_neon(

View File

@ -10,7 +10,7 @@
#include <arm_neon.h>
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
void vp9_idct8x8_1_add_neon(

2476
vpx_dsp/inv_txfm.c Normal file

File diff suppressed because it is too large Load Diff

124
vpx_dsp/inv_txfm.h Normal file
View File

@ -0,0 +1,124 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_INV_TXFM_H_
#define VPX_DSP_INV_TXFM_H_
#include <assert.h>
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
// Narrows an intermediate transform coefficient from tran_high_t to
// tran_low_t.
//
// When the build is configured with --enable-coefficient-range-checking
// (CONFIG_COEFFICIENT_RANGE_CHECKING), the value is also asserted to fit in a
// signed 16 bit integer. Valid VP9 streams keep every intermediate stage
// coefficient inside that range; only invalid/corrupt streams can leave it.
// The check is opt-in because verifying every intermediate coefficient can be
// burdensome for a decoder.
static INLINE tran_low_t check_range(tran_high_t input) {
  const tran_low_t narrowed = (tran_low_t)input;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
  assert(INT16_MIN <= input);
  assert(input <= INT16_MAX);
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
  return narrowed;
}
// Applies ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) to |input| and returns the
// result after passing it through check_range().
static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
  return check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS));
}
#if CONFIG_VP9_HIGHBITDEPTH
// High bit-depth counterpart of check_range(): narrows |input| to tran_low_t.
//
// With CONFIG_COEFFICIENT_RANGE_CHECKING enabled, the value is asserted to
// fit in a signed (8 + bd) bit integer, which is the range intermediate stage
// coefficients of valid high bit-depth VP9 streams stay within:
//   - bd ==  8: signed 16 bit integer
//   - bd == 10: signed 18 bit integer
//   - bd == 12: signed 20 bit integer
static INLINE tran_low_t highbd_check_range(tran_high_t input, int bd) {
  const tran_low_t narrowed = (tran_low_t)input;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
  const int32_t int_max = (1 << (7 + bd)) - 1;
  const int32_t int_min = -int_max - 1;
  assert(int_min <= input);
  assert(input <= int_max);
  // int_min is referenced only by the assert above.
  (void) int_min;
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
  // bd is otherwise unused when range checking is compiled out.
  (void) bd;
  return narrowed;
}
// Applies ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) to |input| and returns the
// result after passing it through highbd_check_range() for bit depth |bd|.
static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
                                                      int bd) {
  return highbd_check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations:
// the value is truncated to (8 + bd) bits and sign-extended.
// bd of 8 uses tran_low_t with 16 bits, need to remove 16 bits
// bd of 10 uses tran_low_t with 18 bits, need to remove 14 bits
// bd of 12 uses tran_low_t with 20 bits, need to remove 12 bits
// bd of x uses tran_low_t with 8+x bits, need to remove 24-x bits
//
// The left shift is carried out on uint32_t because left-shifting a negative
// signed value is undefined behaviour in C. Both macro parameters are
// parenthesized so that expression arguments (e.g. WRAPLOW(a + b, n + 2))
// expand correctly.
#define WRAPLOW(x, bd) \
  ((int32_t)((uint32_t)(x) << (24 - (bd))) >> (24 - (bd)))
#else
// Without hardware emulation no wrapping is performed; bd is ignored.
#define WRAPLOW(x, bd) ((int32_t)(x))
#endif  // CONFIG_EMULATE_HARDWARE
void idct4_c(const tran_low_t *input, tran_low_t *output);
void idct8_c(const tran_low_t *input, tran_low_t *output);
void idct16_c(const tran_low_t *input, tran_low_t *output);
void idct32_c(const tran_low_t *input, tran_low_t *output);
void iadst4_c(const tran_low_t *input, tran_low_t *output);
void iadst8_c(const tran_low_t *input, tran_low_t *output);
void iadst16_c(const tran_low_t *input, tran_low_t *output);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
void highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
void highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
void highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
// Adds the inverse-transform residual |trans| to the high bit-depth pixel
// |dest|. The residual and then the sum are each passed through
// WRAPLOW(..., bd); the wrapped sum is handed to clip_pixel_highbd() for bit
// depth |bd| and that result is returned.
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
  const tran_high_t residual = WRAPLOW(trans, bd);
  return clip_pixel_highbd(WRAPLOW(dest + residual, bd), bd);
}
#endif
// Adds the inverse-transform residual |trans| to the 8-bit pixel |dest|.
// The residual and then the sum are each passed through WRAPLOW(..., 8); the
// wrapped sum is handed to clip_pixel() and that result is returned.
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
  const tran_high_t residual = WRAPLOW(trans, 8);
  return clip_pixel(WRAPLOW(dest + residual, 8));
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_DSP_INV_TXFM_H_

View File

@ -169,6 +169,43 @@ DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
endif # CONFIG_VP9_ENCODER
# inverse transform
# Sources in this section are only added to DSP_SRCS when CONFIG_VP9 is yes.
ifeq ($(CONFIG_VP9),yes)
# C sources, added unconditionally within this section.
DSP_SRCS-yes += inv_txfm.h
DSP_SRCS-yes += inv_txfm.c
# x86 sources, added when HAVE_SSE2 is set.
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.asm
# The SSSE3 assembly is only considered on x86-64 builds with x86inc enabled.
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm
endif # CONFIG_USE_X86INC
endif # ARCH_X86_64
# NEON: when HAVE_NEON_ASM is yes the $(ASM) sources are used; otherwise the
# .c sources with the same base names are used when HAVE_NEON is yes.
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/idct4x4_1_add_neon.c
DSP_SRCS-yes += arm/idct4x4_add_neon.c
DSP_SRCS-yes += arm/idct8x8_1_add_neon.c
DSP_SRCS-yes += arm/idct8x8_add_neon.c
DSP_SRCS-yes += arm/idct16x16_1_add_neon.c
DSP_SRCS-yes += arm/idct16x16_add_neon.c
DSP_SRCS-yes += arm/idct32x32_1_add_neon.c
DSP_SRCS-yes += arm/idct32x32_add_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
# Keyed on HAVE_NEON alone, outside the asm/.c selection above.
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
endif # CONFIG_VP9
# quantization
ifeq ($(CONFIG_VP9_ENCODER),yes)
DSP_SRCS-yes += quantize.c

View File

@ -592,6 +592,193 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER
#
# Inverse transform
# Prototypes for the inverse transform + add functions, registered only when
# CONFIG_VP9 is enabled. Each add_proto is followed by a specialize line; a
# specialize line that lists no architectures enables no optimized versions
# for that function.
if (vpx_config("CONFIG_VP9") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
# Functions without a bd argument: no optimized versions are listed in this configuration.
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_1_add/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_16_add/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add/;
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add/;
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_256_add/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_10_add/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1024_add/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add/;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add/;
add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_1_add/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_16_add/;
# highbd functions, which take an extra "int bd" argument; no optimized versions listed.
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_1_add/;
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_1_add/;
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_1_add/;
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1024_add/;
add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_34_add/;
add_proto qw/void vp9_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1_add/;
add_proto qw/void vp9_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_1_add/;
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add/;
} else {
# Hardware emulation off: the same five highbd functions are specialized for sse2.
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add sse2/;
} # CONFIG_EMULATE_HARDWARE
} else {
# CONFIG_VP9_HIGHBITDEPTH is not enabled: only the functions without a bd argument are registered.
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_1_add/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_16_add/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add/;
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add/;
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_256_add/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_10_add/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1024_add/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add/;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add/;
add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_1_add/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_16_add/;
} else {
# Hardware emulation off: the architecture-specific versions listed on each specialize line are enabled.
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct4x4_16_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/;
# Need to add 34 eob idct32x32 neon implementation.
# Until then the neon_asm entry is aliased to the 1024 neon function.
$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_1_add msa/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
} # CONFIG_EMULATE_HARDWARE
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9
#
# Quantization
#

4053
vpx_dsp/x86/inv_txfm_sse2.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -8,12 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
#define VPX_DSP_X86_INV_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/inv_txfm.h"
// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
@ -172,3 +173,12 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE(dest + 14 * stride, in[14]);
RECON_AND_STORE(dest + 15 * stride, in[15]);
}
void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1);
void iadst4_sse2(__m128i *in);
void iadst8_sse2(__m128i *in);
void iadst16_sse2(__m128i *in0, __m128i *in1);
#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_