Backports highbitdepth accelerations into vp10
Ports the changes from https://chromium-review.googlesource.com/#/c/302372/3 into vp10.
Change-Id: I334c409f693691227ad16fc703c91899592dd8dc
This commit is contained in:
parent
cb5c47f20d
commit
f18322262f
@ -87,65 +87,127 @@ specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
# Note: as optimized versions of these functions are added, we need to add a check to ensure
|
||||
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
|
||||
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp10_iht4x4_16_add/;
|
||||
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
|
||||
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp10_iht4x4_16_add/;
|
||||
|
||||
add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp10_iht8x8_64_add/;
|
||||
add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp10_iht8x8_64_add/;
|
||||
|
||||
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
|
||||
specialize qw/vp10_iht16x16_256_add/;
|
||||
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
|
||||
specialize qw/vp10_iht16x16_256_add/;
|
||||
|
||||
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct4x4 sse2/;
|
||||
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct4x4/;
|
||||
|
||||
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct4x4_1 sse2/;
|
||||
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct4x4_1/;
|
||||
|
||||
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct8x8 sse2/;
|
||||
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct8x8/;
|
||||
|
||||
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct8x8_1 sse2/;
|
||||
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct8x8_1/;
|
||||
|
||||
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct16x16 sse2/;
|
||||
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct16x16/;
|
||||
|
||||
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct16x16_1 sse2/;
|
||||
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct16x16_1/;
|
||||
|
||||
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32 sse2/;
|
||||
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32/;
|
||||
|
||||
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32_rd sse2/;
|
||||
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32_rd/;
|
||||
|
||||
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32_1 sse2/;
|
||||
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32_1/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct4x4 sse2/;
|
||||
add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct4x4/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct8x8 sse2/;
|
||||
add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct8x8/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct8x8_1/;
|
||||
add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct8x8_1/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct16x16 sse2/;
|
||||
add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct16x16/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct16x16_1/;
|
||||
add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct16x16_1/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32 sse2/;
|
||||
add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32_rd sse2/;
|
||||
add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32_rd/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32_1/;
|
||||
add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32_1/;
|
||||
} else {
|
||||
add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp10_iht4x4_16_add sse2/;
|
||||
|
||||
add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp10_iht8x8_64_add sse2/;
|
||||
|
||||
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
|
||||
specialize qw/vp10_iht16x16_256_add/;
|
||||
|
||||
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct4x4_1 sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct8x8_1 sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct16x16_1 sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32 sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32_rd sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct32x32_1 sse2/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct8x8_1/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct16x16_1/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32 sse2/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32_rd sse2/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_highbd_fdct32x32_1/;
|
||||
}
|
||||
} else {
|
||||
# Force C versions if CONFIG_EMULATE_HARDWARE is enabled ("yes")
|
||||
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
|
||||
|
@ -12,14 +12,14 @@
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
|
||||
int tx_type) {
|
||||
void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
|
||||
int tx_type) {
|
||||
__m128i in[2];
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i eight = _mm_set1_epi16(8);
|
||||
|
||||
in[0] = _mm_loadu_si128((const __m128i *)(input));
|
||||
in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
|
||||
in[0] = load_input_data(input);
|
||||
in[1] = load_input_data(input + 8);
|
||||
|
||||
switch (tx_type) {
|
||||
case 0: // DCT_DCT
|
||||
@ -77,21 +77,21 @@ void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
|
||||
}
|
||||
}
|
||||
|
||||
void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
|
||||
int tx_type) {
|
||||
void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
|
||||
int tx_type) {
|
||||
__m128i in[8];
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
|
||||
|
||||
// load input data
|
||||
in[0] = _mm_load_si128((const __m128i *)input);
|
||||
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
|
||||
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
|
||||
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
|
||||
in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
|
||||
in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
|
||||
in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
|
||||
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
|
||||
in[0] = load_input_data(input);
|
||||
in[1] = load_input_data(input + 8 * 1);
|
||||
in[2] = load_input_data(input + 8 * 2);
|
||||
in[3] = load_input_data(input + 8 * 3);
|
||||
in[4] = load_input_data(input + 8 * 4);
|
||||
in[5] = load_input_data(input + 8 * 5);
|
||||
in[6] = load_input_data(input + 8 * 6);
|
||||
in[7] = load_input_data(input + 8 * 7);
|
||||
|
||||
switch (tx_type) {
|
||||
case 0: // DCT_DCT
|
||||
@ -144,8 +144,8 @@ void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
|
||||
RECON_AND_STORE(dest + 7 * stride, in[7]);
|
||||
}
|
||||
|
||||
void vp10_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
|
||||
int tx_type) {
|
||||
void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
int stride, int tx_type) {
|
||||
__m128i in0[16], in1[16];
|
||||
|
||||
load_buffer_8x16(input, in0);
|
||||
|
Loading…
x
Reference in New Issue
Block a user