Merge "WIP: 4x4 idct/recon merge" into experimental
This commit is contained in:
commit
1db6373267
@ -96,11 +96,15 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
int16_t test_input_block[16];
|
||||
int16_t test_temp_block[16];
|
||||
int16_t test_output_block[16];
|
||||
uint8_t dst[16], src[16];
|
||||
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
src[j] = rnd.Rand8();
|
||||
dst[j] = rnd.Rand8();
|
||||
}
|
||||
// Initialize a test block with input range [-255, 255].
|
||||
for (int j = 0; j < 16; ++j)
|
||||
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
|
||||
test_input_block[j] = src[j] - dst[j];
|
||||
|
||||
// TODO(Yaowu): this should be converted to a parameterized test
|
||||
// to test optimized versions of this function.
|
||||
@ -120,10 +124,10 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
|
||||
}
|
||||
|
||||
// Because the bitstream is not frozen yet, use the idct in the codebase.
|
||||
vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
|
||||
vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
|
||||
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
const int diff = test_input_block[j] - test_output_block[j];
|
||||
const int diff = dst[j] - src[j];
|
||||
const int error = diff * diff;
|
||||
if (max_error < error)
|
||||
max_error = error;
|
||||
|
@ -391,8 +391,8 @@ typedef struct macroblockd {
|
||||
|
||||
int lossless;
|
||||
/* Inverse transform function pointers. */
|
||||
void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
|
||||
void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
|
||||
void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
|
||||
void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
|
||||
void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
|
||||
void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
|
||||
struct macroblockd *xd);
|
||||
|
@ -18,12 +18,12 @@
|
||||
#include "vp9/common/vp9_common.h"
|
||||
#include "vp9/common/vp9_idct.h"
|
||||
|
||||
void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
|
||||
void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
|
||||
int i;
|
||||
int16_t output[16];
|
||||
int a1, b1, c1, d1;
|
||||
int16_t *ip = input;
|
||||
int16_t *op = output;
|
||||
const int half_pitch = pitch >> 1;
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
|
||||
@ -37,63 +37,60 @@ void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
|
||||
op[3] = (d1 - c1) >> 1;
|
||||
|
||||
ip += 4;
|
||||
op += half_pitch;
|
||||
op += 4;
|
||||
}
|
||||
|
||||
ip = output;
|
||||
op = output;
|
||||
for (i = 0; i < 4; i++) {
|
||||
a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
|
||||
b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
|
||||
c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
|
||||
d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
|
||||
a1 = ip[4 * 0] + ip[4 * 3];
|
||||
b1 = ip[4 * 1] + ip[4 * 2];
|
||||
c1 = ip[4 * 1] - ip[4 * 2];
|
||||
d1 = ip[4 * 0] - ip[4 * 3];
|
||||
|
||||
|
||||
op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
|
||||
op[half_pitch * 1] = (c1 + d1) >> 1;
|
||||
op[half_pitch * 2] = (a1 - b1) >> 1;
|
||||
op[half_pitch * 3] = (d1 - c1) >> 1;
|
||||
dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
|
||||
((a1 + b1 + 1) >> 1));
|
||||
dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
|
||||
((c1 + d1) >> 1));
|
||||
dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
|
||||
((a1 - b1) >> 1));
|
||||
dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
|
||||
((d1 - c1) >> 1));
|
||||
|
||||
ip++;
|
||||
op++;
|
||||
dest++;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
|
||||
void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
|
||||
int i;
|
||||
int16_t tmp[4];
|
||||
int16_t *ip = in;
|
||||
int16_t *op = tmp;
|
||||
const int half_pitch = pitch >> 1;
|
||||
|
||||
op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
|
||||
op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
|
||||
|
||||
ip = tmp;
|
||||
op = out;
|
||||
for (i = 0; i < 4; i++) {
|
||||
op[half_pitch * 0] = (ip[0] + 1) >> 1;
|
||||
op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
|
||||
dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
|
||||
((ip[0] + 1) >> 1));
|
||||
dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
|
||||
(ip[0] >> 1));
|
||||
dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
|
||||
(ip[0] >> 1));
|
||||
dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
|
||||
(ip[0] >> 1));
|
||||
ip++;
|
||||
op++;
|
||||
dest++;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
|
||||
uint8_t *dst_ptr,
|
||||
int pitch, int stride) {
|
||||
int r, c;
|
||||
int16_t dc = input_dc;
|
||||
int16_t tmp[4 * 4];
|
||||
vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
|
||||
|
||||
for (r = 0; r < 4; r++) {
|
||||
for (c = 0; c < 4; c++)
|
||||
dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
|
||||
|
||||
dst_ptr += stride;
|
||||
pred_ptr += pitch;
|
||||
}
|
||||
vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);
|
||||
}
|
||||
|
||||
void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
|
||||
@ -116,10 +113,9 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
|
||||
output[3] = step[0] - step[3];
|
||||
}
|
||||
|
||||
void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
|
||||
void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
|
||||
int16_t out[4 * 4];
|
||||
int16_t *outptr = out;
|
||||
const int half_pitch = pitch >> 1;
|
||||
int i, j;
|
||||
int16_t temp_in[4], temp_out[4];
|
||||
|
||||
@ -138,22 +134,24 @@ void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
|
||||
temp_in[j] = out[j * 4 + i];
|
||||
vp9_idct4_1d(temp_in, temp_out);
|
||||
for (j = 0; j < 4; ++j)
|
||||
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
|
||||
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
|
||||
+ dest[j * dest_stride + i]);
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
|
||||
void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
|
||||
int i;
|
||||
int a1;
|
||||
int16_t *op = output;
|
||||
const int half_pitch = pitch >> 1;
|
||||
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
|
||||
out = dct_const_round_shift(out * cospi_16_64);
|
||||
a1 = ROUND_POWER_OF_TWO(out, 4);
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
op[0] = op[1] = op[2] = op[3] = a1;
|
||||
op += half_pitch;
|
||||
dest[0] = clip_pixel(dest[0] + a1);
|
||||
dest[1] = clip_pixel(dest[1] + a1);
|
||||
dest[2] = clip_pixel(dest[2] + a1);
|
||||
dest[3] = clip_pixel(dest[3] + a1);
|
||||
dest += dest_stride;
|
||||
}
|
||||
}
|
||||
|
||||
@ -285,8 +283,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
|
||||
output[3] = dct_const_round_shift(s3);
|
||||
}
|
||||
|
||||
void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
|
||||
int pitch, int tx_type) {
|
||||
void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
|
||||
int tx_type) {
|
||||
const transform_2d IHT_4[] = {
|
||||
{ vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0
|
||||
{ iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1
|
||||
@ -312,10 +310,10 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
|
||||
temp_in[j] = out[j * 4 + i];
|
||||
IHT_4[tx_type].cols(temp_in, temp_out);
|
||||
for (j = 0; j < 4; ++j)
|
||||
output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
|
||||
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
|
||||
+ dest[j * dest_stride + i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void iadst8_1d(int16_t *input, int16_t *output) {
|
||||
int s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
|
||||
|
@ -11,11 +11,10 @@
|
||||
#include "vp9/common/vp9_invtrans.h"
|
||||
#include "./vp9_rtcd.h"
|
||||
|
||||
void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
|
||||
int16_t *dqcoeff, int16_t *diff,
|
||||
int pitch) {
|
||||
void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
|
||||
uint8_t *dest, int stride) {
|
||||
if (eob <= 1)
|
||||
xd->inv_txm4x4_1(dqcoeff, diff, pitch);
|
||||
xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
|
||||
else
|
||||
xd->inv_txm4x4(dqcoeff, diff, pitch);
|
||||
xd->inv_txm4x4_add(dqcoeff, dest, stride);
|
||||
}
|
||||
|
@ -15,7 +15,6 @@
|
||||
#include "vpx/vpx_integer.h"
|
||||
#include "vp9/common/vp9_blockd.h"
|
||||
|
||||
void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
|
||||
int16_t *dqcoeff, int16_t *diff,
|
||||
int pitch);
|
||||
void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
|
||||
uint8_t *dest, int stride);
|
||||
#endif // VP9_COMMON_VP9_INVTRANS_H_
|
||||
|
@ -85,9 +85,6 @@ prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLO
|
||||
specialize vp9_intra4x4_predict;
|
||||
|
||||
if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
|
||||
prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
|
||||
specialize vp9_add_residual_4x4 sse2
|
||||
|
||||
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
|
||||
specialize vp9_add_constant_residual_8x8 sse2
|
||||
|
||||
@ -179,11 +176,11 @@ specialize vp9_convolve8_avg_vert ssse3
|
||||
#
|
||||
# dct
|
||||
#
|
||||
prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
|
||||
specialize vp9_short_idct4x4_1
|
||||
prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_idct4x4_1_add
|
||||
|
||||
prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
|
||||
specialize vp9_short_idct4x4 sse2
|
||||
prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_idct4x4_add sse2
|
||||
|
||||
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_idct8x8_add sse2
|
||||
@ -212,12 +209,12 @@ specialize vp9_short_idct1_32x32
|
||||
prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_idct10_32x32_add
|
||||
|
||||
prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
|
||||
specialize vp9_short_iht4x4_add
|
||||
|
||||
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
|
||||
specialize vp9_short_iht8x8_add
|
||||
|
||||
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
|
||||
specialize vp9_short_iht4x4
|
||||
|
||||
prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
|
||||
specialize vp9_short_iht16x16_add
|
||||
|
||||
@ -229,12 +226,11 @@ specialize vp9_idct4_1d sse2
|
||||
prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
|
||||
specialize vp9_dc_only_idct_add sse2
|
||||
|
||||
prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
|
||||
specialize vp9_short_iwalsh4x4_1
|
||||
prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
|
||||
specialize vp9_short_iwalsh4x4
|
||||
prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
|
||||
specialize vp9_dc_only_inv_walsh_add
|
||||
prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_iwalsh4x4_1_add
|
||||
|
||||
prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_iwalsh4x4_add
|
||||
|
||||
prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
|
||||
specialize vp9_sad32x3
|
||||
|
@ -73,7 +73,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
|
||||
*(int *)dst_ptr = _mm_cvtsi128_si32(p1);
|
||||
}
|
||||
|
||||
void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i eight = _mm_set1_epi16(8);
|
||||
const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
|
||||
@ -81,7 +81,6 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
(int16_t)cospi_24_64, (int16_t)-cospi_8_64,
|
||||
(int16_t)cospi_8_64, (int16_t)cospi_24_64);
|
||||
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||
const int half_pitch = pitch >> 1;
|
||||
__m128i input0, input1, input2, input3;
|
||||
|
||||
// Rows
|
||||
@ -188,14 +187,23 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
input2 = _mm_srai_epi16(input2, 4);
|
||||
input3 = _mm_srai_epi16(input3, 4);
|
||||
|
||||
// Store results
|
||||
_mm_storel_epi64((__m128i *)output, input2);
|
||||
input2 = _mm_srli_si128(input2, 8);
|
||||
_mm_storel_epi64((__m128i *)(output + half_pitch), input2);
|
||||
#define RECON_AND_STORE4X4(dest, in_x) \
|
||||
{ \
|
||||
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
|
||||
d0 = _mm_unpacklo_epi8(d0, zero); \
|
||||
d0 = _mm_add_epi16(in_x, d0); \
|
||||
d0 = _mm_packus_epi16(d0, d0); \
|
||||
*(int *)dest = _mm_cvtsi128_si32(d0); \
|
||||
dest += stride; \
|
||||
}
|
||||
|
||||
_mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
|
||||
input3 = _mm_srli_si128(input3, 8);
|
||||
_mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
|
||||
input0 = _mm_srli_si128(input2, 8);
|
||||
input1 = _mm_srli_si128(input3, 8);
|
||||
|
||||
RECON_AND_STORE4X4(dest, input2);
|
||||
RECON_AND_STORE4X4(dest, input0);
|
||||
RECON_AND_STORE4X4(dest, input1);
|
||||
RECON_AND_STORE4X4(dest, input3);
|
||||
}
|
||||
|
||||
void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
|
||||
|
@ -1006,14 +1006,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
|
||||
pc->uv_dc_delta_q == 0 &&
|
||||
pc->uv_ac_delta_q == 0;
|
||||
if (xd->lossless) {
|
||||
xd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
|
||||
xd->inv_txm4x4 = vp9_short_iwalsh4x4;
|
||||
xd->itxm_add = vp9_idct_add_lossless_c;
|
||||
xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c;
|
||||
xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
|
||||
} else {
|
||||
xd->inv_txm4x4_1 = vp9_short_idct4x4_1;
|
||||
xd->inv_txm4x4 = vp9_short_idct4x4;
|
||||
xd->itxm_add = vp9_idct_add;
|
||||
xd->itxm_add_y_block = vp9_idct_add_y_block;
|
||||
xd->itxm_add_uv_block = vp9_idct_add_uv_block;
|
||||
|
@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
|
||||
int width, int height) {
|
||||
int r, c;
|
||||
|
||||
for (r = 0; r < height; r++) {
|
||||
for (c = 0; c < width; c++)
|
||||
dest[c] = clip_pixel(diff[c] + dest[c]);
|
||||
|
||||
dest += stride;
|
||||
diff += width;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
|
||||
add_residual(diff, dest, stride, 4, 4);
|
||||
}
|
||||
|
||||
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
|
||||
int width, int height) {
|
||||
int r, c;
|
||||
@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
|
||||
if (tx_type == DCT_DCT) {
|
||||
vp9_idct_add(input, dest, stride, eob);
|
||||
} else {
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
|
||||
|
||||
vp9_short_iht4x4(input, output, 4, tx_type);
|
||||
vp9_short_iht4x4_add(input, dest, stride, tx_type);
|
||||
vpx_memset(input, 0, 32);
|
||||
vp9_add_residual_4x4(output, dest, stride);
|
||||
}
|
||||
}
|
||||
|
||||
@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
|
||||
}
|
||||
|
||||
void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
|
||||
|
||||
if (eob > 1) {
|
||||
// the idct halves ( >> 1) the pitch
|
||||
vp9_short_idct4x4(input, output, 4 << 1);
|
||||
vp9_short_idct4x4_add(input, dest, stride);
|
||||
vpx_memset(input, 0, 32);
|
||||
vp9_add_residual_4x4(output, dest, stride);
|
||||
} else {
|
||||
vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
|
||||
((int *)input)[0] = 0;
|
||||
@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
|
||||
}
|
||||
|
||||
void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
|
||||
|
||||
input[0] = dc;
|
||||
|
||||
// the idct halves ( >> 1) the pitch
|
||||
vp9_short_idct4x4(input, output, 4 << 1);
|
||||
vp9_short_idct4x4_add(input, dest, stride);
|
||||
vpx_memset(input, 0, 32);
|
||||
vp9_add_residual_4x4(output, dest, stride);
|
||||
}
|
||||
|
||||
void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
|
||||
int eob) {
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
|
||||
|
||||
if (eob > 1) {
|
||||
vp9_short_iwalsh4x4_c(input, output, 4 << 1);
|
||||
vp9_short_iwalsh4x4_add(input, dest, stride);
|
||||
vpx_memset(input, 0, 32);
|
||||
vp9_add_residual_4x4(output, dest, stride);
|
||||
} else {
|
||||
vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);
|
||||
vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
|
||||
((int *)input)[0] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
|
||||
int stride, int dc) {
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
|
||||
|
||||
input[0] = dc;
|
||||
vp9_short_iwalsh4x4_c(input, output, 4 << 1);
|
||||
vp9_short_iwalsh4x4_add(input, dest, stride);
|
||||
vpx_memset(input, 0, 32);
|
||||
vp9_add_residual_4x4(output, dest, stride);
|
||||
}
|
||||
|
||||
void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
|
||||
|
@ -15,49 +15,6 @@
|
||||
#include "vp9/common/vp9_common.h"
|
||||
#include "vp9/common/vp9_idct.h"
|
||||
|
||||
void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
|
||||
const int width = 4;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
// Diff data
|
||||
const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
|
||||
const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
|
||||
const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
|
||||
const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
|
||||
|
||||
// Prediction data.
|
||||
__m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));
|
||||
__m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));
|
||||
__m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));
|
||||
__m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));
|
||||
|
||||
p0 = _mm_unpacklo_epi8(p0, zero);
|
||||
p1 = _mm_unpacklo_epi8(p1, zero);
|
||||
p2 = _mm_unpacklo_epi8(p2, zero);
|
||||
p3 = _mm_unpacklo_epi8(p3, zero);
|
||||
|
||||
p0 = _mm_add_epi16(p0, d0);
|
||||
p1 = _mm_add_epi16(p1, d1);
|
||||
p2 = _mm_add_epi16(p2, d2);
|
||||
p3 = _mm_add_epi16(p3, d3);
|
||||
|
||||
p0 = _mm_packus_epi16(p0, p1);
|
||||
p2 = _mm_packus_epi16(p2, p3);
|
||||
|
||||
*(int *)dest = _mm_cvtsi128_si32(p0);
|
||||
dest += stride;
|
||||
|
||||
p0 = _mm_srli_si128(p0, 8);
|
||||
*(int *)dest = _mm_cvtsi128_si32(p0);
|
||||
dest += stride;
|
||||
|
||||
*(int *)dest = _mm_cvtsi128_si32(p2);
|
||||
dest += stride;
|
||||
|
||||
p2 = _mm_srli_si128(p2, 8);
|
||||
*(int *)dest = _mm_cvtsi128_si32(p2);
|
||||
}
|
||||
|
||||
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
|
||||
int stride) {
|
||||
uint8_t abs_diff;
|
||||
|
@ -1169,8 +1169,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
|
||||
if (lossless) {
|
||||
cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
|
||||
cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
|
||||
cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
|
||||
cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4;
|
||||
cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
|
||||
cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
|
||||
cpi->mb.optimize = 0;
|
||||
cpi->common.filter_level = 0;
|
||||
cpi->zbin_mode_boost_enabled = 0;
|
||||
@ -1178,8 +1178,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
|
||||
} else {
|
||||
cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
|
||||
cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
|
||||
cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
|
||||
cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4;
|
||||
cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
|
||||
cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -53,9 +53,6 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
|
||||
int16_t* const src_diff =
|
||||
raster_block_offset_int16(xd, bsize, 0, ib,
|
||||
x->plane[0].src_diff);
|
||||
int16_t* const diff =
|
||||
raster_block_offset_int16(xd, bsize, 0, ib,
|
||||
xd->plane[0].diff);
|
||||
int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
|
||||
const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
|
||||
|
||||
@ -72,17 +69,15 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
|
||||
if (tx_type != DCT_DCT) {
|
||||
vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
|
||||
x->quantize_b_4x4(x, ib, tx_type, 16);
|
||||
vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
|
||||
diff, 4 << bwl, tx_type);
|
||||
vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), dst,
|
||||
xd->plane[0].dst.stride, tx_type);
|
||||
} else {
|
||||
x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
|
||||
x->quantize_b_4x4(x, ib, tx_type, 16);
|
||||
vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
|
||||
vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib],
|
||||
BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
|
||||
diff, 8 << bwl);
|
||||
dst, xd->plane[0].dst.stride);
|
||||
}
|
||||
|
||||
vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
|
||||
}
|
||||
|
||||
void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
|
||||
|
@ -425,7 +425,6 @@ struct encode_b_args {
|
||||
VP9_COMMON *cm;
|
||||
MACROBLOCK *x;
|
||||
struct optimize_ctx *ctx;
|
||||
int *wip_txfrm_size; // for "work in progress" only... will remove once done
|
||||
};
|
||||
|
||||
static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
@ -494,14 +493,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
int ss_txfrm_size, void *arg) {
|
||||
struct encode_b_args* const args = arg;
|
||||
MACROBLOCK* const x = args->x;
|
||||
int *wip_txfrm_size = args->wip_txfrm_size;
|
||||
MACROBLOCKD* const xd = &x->e_mbd;
|
||||
const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
|
||||
const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
|
||||
block, ss_txfrm_size);
|
||||
int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
|
||||
raster_block,
|
||||
xd->plane[plane].diff);
|
||||
uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
|
||||
raster_block,
|
||||
xd->plane[plane].dst.buf,
|
||||
@ -517,7 +511,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
case TX_32X32:
|
||||
vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
|
||||
block, 16), dst, xd->plane[plane].dst.stride);
|
||||
*wip_txfrm_size = 32;
|
||||
break;
|
||||
case TX_16X16:
|
||||
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
|
||||
@ -529,7 +522,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
block, 16), dst, xd->plane[plane].dst.stride,
|
||||
tx_type);
|
||||
}
|
||||
*wip_txfrm_size = 16;
|
||||
break;
|
||||
case TX_8X8:
|
||||
tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
|
||||
@ -541,7 +533,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
block, 16), dst, xd->plane[plane].dst.stride,
|
||||
tx_type);
|
||||
}
|
||||
*wip_txfrm_size = 8;
|
||||
break;
|
||||
case TX_4X4:
|
||||
tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
|
||||
@ -549,13 +540,13 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
// this is like vp9_short_idct4x4 but has a special case around eob<=1
|
||||
// which is significant (not just an optimization) for the lossless
|
||||
// case.
|
||||
vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
|
||||
BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
|
||||
vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block],
|
||||
BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst,
|
||||
xd->plane[plane].dst.stride);
|
||||
} else {
|
||||
vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
|
||||
diff, bw, tx_type);
|
||||
vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
|
||||
dst, xd->plane[plane].dst.stride, tx_type);
|
||||
}
|
||||
*wip_txfrm_size = 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -563,16 +554,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
BLOCK_SIZE_TYPE bsize) {
|
||||
MACROBLOCKD* const xd = &x->e_mbd;
|
||||
struct encode_b_args arg = {cm, x, NULL, NULL};
|
||||
struct encode_b_args arg = {cm, x, NULL};
|
||||
|
||||
foreach_transformed_block_in_plane(xd, bsize, 0,
|
||||
xform_quant, &arg);
|
||||
foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
|
||||
}
|
||||
|
||||
void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
BLOCK_SIZE_TYPE bsize) {
|
||||
MACROBLOCKD* const xd = &x->e_mbd;
|
||||
struct encode_b_args arg = {cm, x, NULL, NULL};
|
||||
struct encode_b_args arg = {cm, x, NULL};
|
||||
|
||||
foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
|
||||
}
|
||||
@ -581,61 +571,37 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
BLOCK_SIZE_TYPE bsize) {
|
||||
MACROBLOCKD* const xd = &x->e_mbd;
|
||||
struct optimize_ctx ctx;
|
||||
int wip_txfrm_size = 0;
|
||||
struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
|
||||
struct encode_b_args arg = {cm, x, &ctx};
|
||||
|
||||
vp9_subtract_sby(x, bsize);
|
||||
if (x->optimize)
|
||||
vp9_optimize_init(xd, bsize, &ctx);
|
||||
|
||||
foreach_transformed_block_in_plane(xd, bsize, 0,
|
||||
encode_block, &arg);
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sby(xd, bsize);
|
||||
foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
|
||||
}
|
||||
|
||||
void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
BLOCK_SIZE_TYPE bsize) {
|
||||
MACROBLOCKD* const xd = &x->e_mbd;
|
||||
struct optimize_ctx ctx;
|
||||
int wip_txfrm_size = 0;
|
||||
struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
|
||||
struct encode_b_args arg = {cm, x, &ctx};
|
||||
|
||||
vp9_subtract_sbuv(x, bsize);
|
||||
if (x->optimize)
|
||||
vp9_optimize_init(xd, bsize, &ctx);
|
||||
|
||||
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
|
||||
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sbuv(xd, bsize);
|
||||
}
|
||||
|
||||
void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
BLOCK_SIZE_TYPE bsize) {
|
||||
MACROBLOCKD* const xd = &x->e_mbd;
|
||||
struct optimize_ctx ctx;
|
||||
int wip_txfrm_size = 0;
|
||||
struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
|
||||
struct encode_b_args arg = {cm, x, &ctx};
|
||||
|
||||
vp9_subtract_sb(x, bsize);
|
||||
if (x->optimize)
|
||||
vp9_optimize_init(xd, bsize, &ctx);
|
||||
#if 0
|
||||
|
||||
foreach_transformed_block(xd, bsize, encode_block, &arg);
|
||||
|
||||
vp9_recon_sb(xd, bsize);
|
||||
#else
|
||||
// wip version... will use foreach_transformed_block when done
|
||||
foreach_transformed_block_in_plane(xd, bsize, 0,
|
||||
encode_block, &arg);
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sby(xd, bsize);
|
||||
wip_txfrm_size = 0;
|
||||
|
||||
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
|
||||
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sbuv(xd, bsize);
|
||||
#endif
|
||||
}
|
||||
|
@ -1178,11 +1178,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
|
||||
|
||||
cpi->oxcf.lossless = oxcf->lossless;
|
||||
if (cpi->oxcf.lossless) {
|
||||
cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
|
||||
cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4;
|
||||
cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
|
||||
cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
|
||||
} else {
|
||||
cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
|
||||
cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4;
|
||||
cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
|
||||
cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
|
||||
}
|
||||
|
||||
cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
|
||||
|
@ -627,11 +627,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
|
||||
BLOCK_SIZE_SB8X8,
|
||||
0, ib,
|
||||
x->plane[0].src_diff);
|
||||
int16_t* const diff =
|
||||
raster_block_offset_int16(xd,
|
||||
BLOCK_SIZE_SB8X8,
|
||||
0, ib,
|
||||
xd->plane[0].diff);
|
||||
int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
|
||||
uint8_t* const dst =
|
||||
raster_block_offset_uint8(xd,
|
||||
@ -703,18 +698,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
|
||||
xd->mode_info_context->bmi[ib].as_mode.first =
|
||||
(B_PREDICTION_MODE)(*best_mode);
|
||||
|
||||
// inverse transform
|
||||
if (best_tx_type != DCT_DCT)
|
||||
vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
|
||||
else
|
||||
xd->inv_txm4x4(best_dqcoeff, diff, 16);
|
||||
|
||||
vp9_intra4x4_predict(xd, ib,
|
||||
BLOCK_SIZE_SB8X8,
|
||||
*best_mode,
|
||||
dst, xd->plane[0].dst.stride);
|
||||
vp9_recon_b(dst, diff, 8,
|
||||
dst, xd->plane[0].dst.stride);
|
||||
|
||||
// inverse transform
|
||||
if (best_tx_type != DCT_DCT) {
|
||||
vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride,
|
||||
best_tx_type);
|
||||
} else {
|
||||
xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride);
|
||||
}
|
||||
|
||||
return best_rd;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user