Merge "WIP: 8x8 idct/recon merge" into experimental
This commit is contained in:
commit
9aa37a51b2
@ -16,6 +16,7 @@
|
||||
|
||||
extern "C" {
|
||||
#include "vp9_rtcd.h"
|
||||
void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
|
||||
}
|
||||
|
||||
#include "acm_random.h"
|
||||
@ -100,11 +101,15 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
int16_t test_input_block[64];
|
||||
int16_t test_temp_block[64];
|
||||
int16_t test_output_block[64];
|
||||
uint8_t dst[64], src[64];
|
||||
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
src[j] = rnd.Rand8();
|
||||
dst[j] = rnd.Rand8();
|
||||
}
|
||||
// Initialize a test block with input range [-255, 255].
|
||||
for (int j = 0; j < 64; ++j)
|
||||
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
|
||||
test_input_block[j] = src[j] - dst[j];
|
||||
|
||||
const int pitch = 16;
|
||||
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
|
||||
@ -119,10 +124,10 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
|
||||
test_temp_block[j] *= 4;
|
||||
}
|
||||
}
|
||||
vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
|
||||
vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
|
||||
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
const int diff = test_input_block[j] - test_output_block[j];
|
||||
const int diff = dst[j] - src[j];
|
||||
const int error = diff * diff;
|
||||
if (max_error < error)
|
||||
max_error = error;
|
||||
@ -145,18 +150,22 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) {
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
int16_t test_input_block[64];
|
||||
int16_t test_temp_block[64];
|
||||
int16_t test_output_block[64];
|
||||
uint8_t dst[64], src[64];
|
||||
|
||||
// Initialize a test block with input range {-255, 255}.
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
src[j] = rnd.Rand8() % 2 ? 255 : 0;
|
||||
dst[j] = src[j] > 0 ? 0 : 255;
|
||||
}
|
||||
// Initialize a test block with input range [-255, 255].
|
||||
for (int j = 0; j < 64; ++j)
|
||||
test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;
|
||||
test_input_block[j] = src[j] - dst[j];
|
||||
|
||||
const int pitch = 16;
|
||||
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
|
||||
vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
|
||||
vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
|
||||
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
const int diff = test_input_block[j] - test_output_block[j];
|
||||
const int diff = dst[j] - src[j];
|
||||
const int error = diff * diff;
|
||||
if (max_error < error)
|
||||
max_error = error;
|
||||
|
@ -112,20 +112,23 @@ TEST(VP9Idct8x8Test, AccuracyCheck) {
|
||||
const int count_test_block = 10000;
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
int16_t input[64], coeff[64];
|
||||
int16_t output_c[64];
|
||||
double output_r[64];
|
||||
uint8_t dst[64], src[64];
|
||||
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
src[j] = rnd.Rand8();
|
||||
dst[j] = rnd.Rand8();
|
||||
}
|
||||
// Initialize a test block with input range [-255, 255].
|
||||
for (int j = 0; j < 64; ++j)
|
||||
input[j] = rnd.Rand8() - rnd.Rand8();
|
||||
input[j] = src[j] - dst[j];
|
||||
|
||||
const int pitch = 16;
|
||||
reference_dct_2d(input, output_r);
|
||||
for (int j = 0; j < 64; ++j)
|
||||
coeff[j] = round(output_r[j]);
|
||||
vp9_short_idct8x8_c(coeff, output_c, pitch);
|
||||
vp9_short_idct8x8_add_c(coeff, dst, 8);
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
const int diff = output_c[j] -input[j];
|
||||
const int diff = dst[j] - src[j];
|
||||
const int error = diff * diff;
|
||||
EXPECT_GE(1, error)
|
||||
<< "Error: 8x8 FDCT/IDCT has error " << error
|
||||
|
@ -219,27 +219,27 @@ static void idct8_1d(int16_t *input, int16_t *output) {
|
||||
output[7] = step1[0] - step1[7];
|
||||
}
|
||||
|
||||
void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
|
||||
void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
|
||||
int16_t out[8 * 8];
|
||||
int16_t *outptr = out;
|
||||
const int half_pitch = pitch >> 1;
|
||||
int i, j;
|
||||
int16_t temp_in[8], temp_out[8];
|
||||
|
||||
// Rows
|
||||
// First transform rows
|
||||
for (i = 0; i < 8; ++i) {
|
||||
idct8_1d(input, outptr);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
|
||||
// Columns
|
||||
// Then transform columns
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j)
|
||||
temp_in[j] = out[j * 8 + i];
|
||||
idct8_1d(temp_in, temp_out);
|
||||
for (j = 0; j < 8; ++j)
|
||||
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
|
||||
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
|
||||
+ dest[j * dest_stride + i]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -400,8 +400,8 @@ static const transform_2d IHT_8[] = {
|
||||
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
|
||||
};
|
||||
|
||||
void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
|
||||
int pitch, int tx_type) {
|
||||
void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
|
||||
int tx_type) {
|
||||
int i, j;
|
||||
int16_t out[8 * 8];
|
||||
int16_t *outptr = out;
|
||||
@ -421,14 +421,14 @@ void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
|
||||
temp_in[j] = out[j * 8 + i];
|
||||
ht.cols(temp_in, temp_out);
|
||||
for (j = 0; j < 8; ++j)
|
||||
output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
|
||||
}
|
||||
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
|
||||
+ dest[j * dest_stride + i]); }
|
||||
}
|
||||
|
||||
void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
|
||||
void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
|
||||
int dest_stride) {
|
||||
int16_t out[8 * 8];
|
||||
int16_t *outptr = out;
|
||||
const int half_pitch = pitch >> 1;
|
||||
int i, j;
|
||||
int16_t temp_in[8], temp_out[8];
|
||||
|
||||
@ -447,7 +447,8 @@ void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
|
||||
temp_in[j] = out[j * 8 + i];
|
||||
idct8_1d(temp_in, temp_out);
|
||||
for (j = 0; j < 8; ++j)
|
||||
output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
|
||||
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
|
||||
+ dest[j * dest_stride + i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -88,9 +88,6 @@ if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
|
||||
prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
|
||||
specialize vp9_add_residual_4x4 sse2
|
||||
|
||||
prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
|
||||
specialize vp9_add_residual_8x8 sse2
|
||||
|
||||
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
|
||||
specialize vp9_add_constant_residual_8x8 sse2
|
||||
|
||||
@ -188,11 +185,11 @@ specialize vp9_short_idct4x4_1
|
||||
prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
|
||||
specialize vp9_short_idct4x4 sse2
|
||||
|
||||
prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
|
||||
specialize vp9_short_idct8x8 sse2
|
||||
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_idct8x8_add sse2
|
||||
|
||||
prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
|
||||
specialize vp9_short_idct10_8x8 sse2
|
||||
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_idct10_8x8_add sse2
|
||||
|
||||
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
|
||||
specialize vp9_short_idct1_8x8
|
||||
@ -215,8 +212,8 @@ specialize vp9_short_idct1_32x32
|
||||
prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||
specialize vp9_short_idct10_32x32_add
|
||||
|
||||
prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
|
||||
specialize vp9_short_iht8x8
|
||||
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
|
||||
specialize vp9_short_iht8x8_add
|
||||
|
||||
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
|
||||
specialize vp9_short_iht4x4
|
||||
|
@ -403,8 +403,18 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
|
||||
in6 = _mm_subs_epi16(stp1_1, stp1_6); \
|
||||
in7 = _mm_subs_epi16(stp1_0, stp2_7);
|
||||
|
||||
void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
const int half_pitch = pitch >> 1;
|
||||
#define RECON_AND_STORE(dest, in_x) \
|
||||
{ \
|
||||
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
|
||||
d0 = _mm_unpacklo_epi8(d0, zero); \
|
||||
in_x = _mm_add_epi16(in_x, d0); \
|
||||
in_x = _mm_packus_epi16(in_x, in_x); \
|
||||
_mm_storel_epi64((__m128i *)(dest), in_x); \
|
||||
dest += stride; \
|
||||
}
|
||||
|
||||
void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||
const __m128i final_rounding = _mm_set1_epi16(1<<4);
|
||||
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
|
||||
@ -461,19 +471,17 @@ void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
in6 = _mm_srai_epi16(in6, 5);
|
||||
in7 = _mm_srai_epi16(in7, 5);
|
||||
|
||||
// Store results
|
||||
_mm_store_si128((__m128i *)output, in0);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
|
||||
RECON_AND_STORE(dest, in0);
|
||||
RECON_AND_STORE(dest, in1);
|
||||
RECON_AND_STORE(dest, in2);
|
||||
RECON_AND_STORE(dest, in3);
|
||||
RECON_AND_STORE(dest, in4);
|
||||
RECON_AND_STORE(dest, in5);
|
||||
RECON_AND_STORE(dest, in6);
|
||||
RECON_AND_STORE(dest, in7);
|
||||
}
|
||||
|
||||
void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
const int half_pitch = pitch >> 1;
|
||||
void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||
const __m128i final_rounding = _mm_set1_epi16(1<<4);
|
||||
@ -612,15 +620,14 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
in6 = _mm_srai_epi16(in6, 5);
|
||||
in7 = _mm_srai_epi16(in7, 5);
|
||||
|
||||
// Store results
|
||||
_mm_store_si128((__m128i *)output, in0);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
|
||||
_mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
|
||||
RECON_AND_STORE(dest, in0);
|
||||
RECON_AND_STORE(dest, in1);
|
||||
RECON_AND_STORE(dest, in2);
|
||||
RECON_AND_STORE(dest, in3);
|
||||
RECON_AND_STORE(dest, in4);
|
||||
RECON_AND_STORE(dest, in5);
|
||||
RECON_AND_STORE(dest, in6);
|
||||
RECON_AND_STORE(dest, in7);
|
||||
}
|
||||
|
||||
#define IDCT16x16_1D \
|
||||
@ -752,16 +759,6 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
|
||||
stp2_10, stp2_13, stp2_11, stp2_12) \
|
||||
}
|
||||
|
||||
#define RECON_AND_STORE(dest, in_x) \
|
||||
{ \
|
||||
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
|
||||
d0 = _mm_unpacklo_epi8(d0, zero); \
|
||||
in_x = _mm_add_epi16(in_x, d0); \
|
||||
in_x = _mm_packus_epi16(in_x, in_x); \
|
||||
_mm_storel_epi64((__m128i *)(dest), in_x); \
|
||||
dest += stride; \
|
||||
}
|
||||
|
||||
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
|
||||
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||
const __m128i final_rounding = _mm_set1_epi16(1<<5);
|
||||
|
@ -101,10 +101,6 @@ void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
|
||||
add_residual(diff, dest, stride, 4, 4);
|
||||
}
|
||||
|
||||
void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
|
||||
add_residual(diff, dest, stride, 8, 8);
|
||||
}
|
||||
|
||||
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
|
||||
int width, int height) {
|
||||
int r, c;
|
||||
@ -151,11 +147,8 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
|
||||
vp9_idct_add_8x8(input, dest, stride, eob);
|
||||
} else {
|
||||
if (eob > 0) {
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
|
||||
|
||||
vp9_short_iht8x8(input, output, 8, tx_type);
|
||||
vp9_short_iht8x8_add(input, dest, stride, tx_type);
|
||||
vpx_memset(input, 0, 128);
|
||||
vp9_add_residual_8x8(output, dest, stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -210,8 +203,6 @@ void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
|
||||
}
|
||||
|
||||
void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
|
||||
|
||||
// If dc is 1, then input[0] is the reconstructed value, do not need
|
||||
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
|
||||
|
||||
@ -233,20 +224,15 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
|
||||
vp9_add_constant_residual_8x8(out, dest, stride);
|
||||
#if !CONFIG_SCATTERSCAN
|
||||
} else if (eob <= 10) {
|
||||
vp9_short_idct10_8x8(input, output, 16);
|
||||
|
||||
vp9_short_idct10_8x8_add(input, dest, stride);
|
||||
input[0] = input[1] = input[2] = input[3] = 0;
|
||||
input[8] = input[9] = input[10] = 0;
|
||||
input[16] = input[17] = 0;
|
||||
input[24] = 0;
|
||||
|
||||
vp9_add_residual_8x8(output, dest, stride);
|
||||
#endif
|
||||
} else {
|
||||
// the idct halves ( >> 1) the pitch
|
||||
vp9_short_idct8x8(input, output, 8 << 1);
|
||||
vp9_short_idct8x8_add(input, dest, stride);
|
||||
vpx_memset(input, 0, 128);
|
||||
vp9_add_residual_8x8(output, dest, stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -58,70 +58,6 @@ void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
|
||||
*(int *)dest = _mm_cvtsi128_si32(p2);
|
||||
}
|
||||
|
||||
void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
|
||||
const int width = 8;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
// Diff data
|
||||
const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
|
||||
const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
|
||||
const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
|
||||
const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
|
||||
const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
|
||||
const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
|
||||
const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
|
||||
const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
|
||||
|
||||
// Prediction data.
|
||||
__m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
|
||||
__m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
|
||||
__m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
|
||||
__m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
|
||||
__m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
|
||||
__m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
|
||||
__m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
|
||||
__m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
|
||||
|
||||
p0 = _mm_unpacklo_epi8(p0, zero);
|
||||
p1 = _mm_unpacklo_epi8(p1, zero);
|
||||
p2 = _mm_unpacklo_epi8(p2, zero);
|
||||
p3 = _mm_unpacklo_epi8(p3, zero);
|
||||
p4 = _mm_unpacklo_epi8(p4, zero);
|
||||
p5 = _mm_unpacklo_epi8(p5, zero);
|
||||
p6 = _mm_unpacklo_epi8(p6, zero);
|
||||
p7 = _mm_unpacklo_epi8(p7, zero);
|
||||
|
||||
p0 = _mm_add_epi16(p0, d0);
|
||||
p1 = _mm_add_epi16(p1, d1);
|
||||
p2 = _mm_add_epi16(p2, d2);
|
||||
p3 = _mm_add_epi16(p3, d3);
|
||||
p4 = _mm_add_epi16(p4, d4);
|
||||
p5 = _mm_add_epi16(p5, d5);
|
||||
p6 = _mm_add_epi16(p6, d6);
|
||||
p7 = _mm_add_epi16(p7, d7);
|
||||
|
||||
p0 = _mm_packus_epi16(p0, p1);
|
||||
p2 = _mm_packus_epi16(p2, p3);
|
||||
p4 = _mm_packus_epi16(p4, p5);
|
||||
p6 = _mm_packus_epi16(p6, p7);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
|
||||
p0 = _mm_srli_si128(p0, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
|
||||
p2 = _mm_srli_si128(p2, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
|
||||
p4 = _mm_srli_si128(p4, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
|
||||
p6 = _mm_srli_si128(p6, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
|
||||
}
|
||||
|
||||
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
|
||||
int stride) {
|
||||
uint8_t abs_diff;
|
||||
|
@ -534,11 +534,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||
case TX_8X8:
|
||||
tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
|
||||
if (tx_type == DCT_DCT) {
|
||||
vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
|
||||
diff, bw * 2);
|
||||
vp9_short_idct8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
|
||||
block, 16), dst, xd->plane[plane].dst.stride);
|
||||
} else {
|
||||
vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
|
||||
diff, bw, tx_type);
|
||||
vp9_short_iht8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
|
||||
block, 16), dst, xd->plane[plane].dst.stride,
|
||||
tx_type);
|
||||
}
|
||||
*wip_txfrm_size = 8;
|
||||
break;
|
||||
@ -589,7 +590,7 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
|
||||
foreach_transformed_block_in_plane(xd, bsize, 0,
|
||||
encode_block, &arg);
|
||||
if (wip_txfrm_size < 32)
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sby(xd, bsize);
|
||||
}
|
||||
|
||||
@ -606,7 +607,7 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
|
||||
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
|
||||
|
||||
if (wip_txfrm_size < 16)
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sbuv(xd, bsize);
|
||||
}
|
||||
|
||||
@ -628,13 +629,13 @@ void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
|
||||
// wip version... will use foreach_transformed_block when done
|
||||
foreach_transformed_block_in_plane(xd, bsize, 0,
|
||||
encode_block, &arg);
|
||||
if (wip_txfrm_size < 16)
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sby(xd, bsize);
|
||||
wip_txfrm_size = 0;
|
||||
|
||||
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
|
||||
|
||||
if (wip_txfrm_size < 16)
|
||||
if (wip_txfrm_size < 8)
|
||||
vp9_recon_sbuv(xd, bsize);
|
||||
#endif
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user