Optimize vp9_dc_only_idct_add_c function
Wrote an SSE2 version of the vp9_dc_only_idct_add_c function. To improve performance, the absolute diff value is clipped to [0, 255], which allows the additions/subtractions to be kept in 8 bits. Tests showed a decoder performance increase of over 2%. Change-Id: Ie1a236d23d207e4ffcd1fc9f3d77462a9c7fe09d
This commit is contained in:
parent
9770d564f4
commit
35bc02c6eb
@ -13,6 +13,13 @@
|
|||||||
|
|
||||||
#include "./vpx_config.h"
|
#include "./vpx_config.h"
|
||||||
|
|
||||||
|
// Adds half of 2^n, i.e. (1 << (n - 1)), to value and then shifts the sum
// right by n. For a non-negative sum this yields value / 2^n rounded to the
// nearest integer. n must be at least 1 (n == 0 would shift by -1), and n is
// expanded twice, so it should not have side effects.
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
|
||||||
|
|
||||||
|
/* If we don't want to use ROUND_POWER_OF_TWO macro
|
||||||
|
static INLINE int16_t round_power_of_two(int16_t value, int n) {
|
||||||
|
return (value + (1 << (n - 1))) >> n;
|
||||||
|
}*/
|
||||||
|
|
||||||
// Constants and Macros used by all idct/dct functions
|
// Constants and Macros used by all idct/dct functions
|
||||||
#define DCT_CONST_BITS 14
|
#define DCT_CONST_BITS 14
|
||||||
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
|
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
|
||||||
|
@ -31,13 +31,6 @@
|
|||||||
#include "vp9/common/vp9_common.h"
|
#include "vp9/common/vp9_common.h"
|
||||||
#include "vp9/common/vp9_idct.h"
|
#include "vp9/common/vp9_idct.h"
|
||||||
|
|
||||||
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
|
|
||||||
|
|
||||||
/* If we don't want to use ROUND_POWER_OF_TWO macro
|
|
||||||
static INLINE int16_t round_power_of_two(int16_t value, int n) {
|
|
||||||
return (value + (1 << (n - 1))) >> n;
|
|
||||||
}*/
|
|
||||||
|
|
||||||
typedef void (*transform_1d)(int16_t*, int16_t*);
|
typedef void (*transform_1d)(int16_t*, int16_t*);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -296,7 +296,7 @@ specialize vp9_short_iht16x16
|
|||||||
# dct and add
|
# dct and add
|
||||||
|
|
||||||
prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
|
prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
|
||||||
specialize vp9_dc_only_idct_add
|
specialize vp9_dc_only_idct_add sse2
|
||||||
|
|
||||||
prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
|
prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
|
||||||
specialize vp9_short_inv_walsh4x4_1_x8
|
specialize vp9_short_inv_walsh4x4_1_x8
|
||||||
|
76
vp9/common/x86/vp9_idctllm_x86.c
Normal file
76
vp9/common/x86/vp9_idctllm_x86.c
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <assert.h>
#include <emmintrin.h> // SSE2
#include <string.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
|
||||||
|
|
||||||
|
#if HAVE_SSE2
|
||||||
|
// In order to improve performance, clip absolute diff values to [0, 255],
|
||||||
|
// which allows to keep the additions/subtractions in 8 bits.
|
||||||
|
void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
|
||||||
|
uint8_t *dst_ptr, int pitch, int stride) {
|
||||||
|
int a1;
|
||||||
|
int16_t out;
|
||||||
|
uint8_t abs_diff;
|
||||||
|
__m128i p0, p1, p2, p3;
|
||||||
|
unsigned int extended_diff;
|
||||||
|
__m128i diff;
|
||||||
|
|
||||||
|
out = dct_const_round_shift(input_dc * cospi_16_64);
|
||||||
|
out = dct_const_round_shift(out * cospi_16_64);
|
||||||
|
a1 = ROUND_POWER_OF_TWO(out, 4);
|
||||||
|
|
||||||
|
// Read prediction data.
|
||||||
|
p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch));
|
||||||
|
p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch));
|
||||||
|
p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch));
|
||||||
|
p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch));
|
||||||
|
|
||||||
|
// Unpack prediction data, and store 4x4 array in 1 XMM register.
|
||||||
|
p0 = _mm_unpacklo_epi32(p0, p1);
|
||||||
|
p2 = _mm_unpacklo_epi32(p2, p3);
|
||||||
|
p0 = _mm_unpacklo_epi64(p0, p2);
|
||||||
|
|
||||||
|
// Clip dc value to [0, 255] range. Then, do addition or subtraction
|
||||||
|
// according to its sign.
|
||||||
|
if (a1 >= 0) {
|
||||||
|
abs_diff = (a1 > 255) ? 255 : a1;
|
||||||
|
extended_diff = abs_diff * 0x01010101u;
|
||||||
|
diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
|
||||||
|
|
||||||
|
p1 = _mm_adds_epu8(p0, diff);
|
||||||
|
} else {
|
||||||
|
abs_diff = (a1 < -255) ? 255 : -a1;
|
||||||
|
extended_diff = abs_diff * 0x01010101u;
|
||||||
|
diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
|
||||||
|
|
||||||
|
p1 = _mm_subs_epu8(p0, diff);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store results to dst.
|
||||||
|
*(int *)dst_ptr = _mm_cvtsi128_si32(p1);
|
||||||
|
dst_ptr += stride;
|
||||||
|
|
||||||
|
p1 = _mm_srli_si128(p1, 4);
|
||||||
|
*(int *)dst_ptr = _mm_cvtsi128_si32(p1);
|
||||||
|
dst_ptr += stride;
|
||||||
|
|
||||||
|
p1 = _mm_srli_si128(p1, 4);
|
||||||
|
*(int *)dst_ptr = _mm_cvtsi128_si32(p1);
|
||||||
|
dst_ptr += stride;
|
||||||
|
|
||||||
|
p1 = _mm_srli_si128(p1, 4);
|
||||||
|
*(int *)dst_ptr = _mm_cvtsi128_si32(p1);
|
||||||
|
}
|
||||||
|
#endif
|
@ -126,7 +126,7 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
|
|||||||
xd->inv_txm4x4_1 = vp9_short_idct4x4llm_1;
|
xd->inv_txm4x4_1 = vp9_short_idct4x4llm_1;
|
||||||
xd->inv_txm4x4 = vp9_short_idct4x4llm;
|
xd->inv_txm4x4 = vp9_short_idct4x4llm;
|
||||||
xd->itxm_add = vp9_dequant_idct_add;
|
xd->itxm_add = vp9_dequant_idct_add;
|
||||||
xd->dc_only_itxm_add = vp9_dc_only_idct_add_c;
|
xd->dc_only_itxm_add = vp9_dc_only_idct_add;
|
||||||
xd->itxm_add_y_block = vp9_dequant_idct_add_y_block;
|
xd->itxm_add_y_block = vp9_dequant_idct_add_y_block;
|
||||||
xd->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
|
xd->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
|
||||||
if (xd->lossless) {
|
if (xd->lossless) {
|
||||||
|
@ -47,7 +47,7 @@ void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
|
|||||||
if (xd->block[i * 4 + j].eob > 1)
|
if (xd->block[i * 4 + j].eob > 1)
|
||||||
vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
|
vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
|
||||||
else {
|
else {
|
||||||
vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
|
vp9_dc_only_idct_add(q[0]*dq[0], pre, dst, 16, stride);
|
||||||
((int *)q)[0] = 0;
|
((int *)q)[0] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,7 +72,7 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
|
|||||||
if (xd->block[16 + i * 2 + j].eob > 1)
|
if (xd->block[16 + i * 2 + j].eob > 1)
|
||||||
vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
|
vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
|
||||||
else {
|
else {
|
||||||
vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
|
vp9_dc_only_idct_add(q[0]*dq[0], pre, dstu, 8, stride);
|
||||||
((int *)q)[0] = 0;
|
((int *)q)[0] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,7 +90,7 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
|
|||||||
if (xd->block[20 + i * 2 + j].eob > 1)
|
if (xd->block[20 + i * 2 + j].eob > 1)
|
||||||
vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
|
vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
|
||||||
else {
|
else {
|
||||||
vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
|
vp9_dc_only_idct_add(q[0]*dq[0], pre, dstv, 8, stride);
|
||||||
((int *)q)[0] = 0;
|
((int *)q)[0] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,10 +110,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
|
|||||||
VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
|
VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idctllm_x86.c
|
||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
|
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
|
||||||
ifeq ($(HAVE_SSE2),yes)
|
ifeq ($(HAVE_SSE2),yes)
|
||||||
|
vp9/common/x86/vp9_idctllm_x86.c.o: CFLAGS += -msse2
|
||||||
vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
|
vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
|
||||||
vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
|
vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
|
||||||
|
vp9/common/x86/vp9_idctllm_x86.c.d: CFLAGS += -msse2
|
||||||
vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
|
vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
|
||||||
vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
|
vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
|
||||||
endif
|
endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user