Split dsp/x86/inv_txfm_sse2.c

Spin out highbd idct functions.

BUG=webm:1412

Change-Id: I0cfe4117c00039b6778c59c022eee79ad089a2af
This commit is contained in:
Linfeng Zhang
2017-05-03 15:43:02 -07:00
parent d5de63d2be
commit 2231669a83
8 changed files with 698 additions and 594 deletions

View File

@@ -231,6 +231,11 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
@@ -351,6 +356,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
# X86 utilities
DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c

View File

@@ -0,0 +1,244 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// 16x16 high-bitdepth inverse DCT + reconstruction, full (256-coefficient)
// version.  Tries a fast 16-bit SSE2 path for each pass (rows, then columns)
// and falls back to the 32-bit C transform for any pass whose inputs could
// overflow 16-bit intermediate precision.
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);  // 0.5 in Q6, for the >> 6 below
  // Coefficients beyond +/-3155 could overflow inside the 16-bit
  // idct16_sse2; such blocks are detected and routed to the C code.
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits.  inptr[i] holds the left 8
  // samples of row i, inptr[i + 16] the right 8.
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 32; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);  // nonzero -> at least one lane out of range

  if (!test) {
    // Do the row transform.
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 32; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the
      // 16-bit row results into out[] so the C column transform can run.
      array_transpose_16x16(inptr, inptr + 16);
      for (i = 0; i < 16; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform.
    for (i = 0; i < 16; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift and Reconstruction and Store.
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        // (x + 32) >> 6, then add to dest and clamp to [0, (1 << bd) - 1].
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
// 16x16 high-bitdepth inverse DCT + reconstruction when at most the first 10
// coefficients (the top-left 4x4 corner) are nonzero.  Mirrors the 256-coeff
// version but only scans/propagates the nonzero region.
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };  // zeroed: only a 4x4 corner gets written
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);  // 0.5 in Q6, for the >> 6 below
  // Overflow threshold for the 16-bit idct16_sse2 intermediates.
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits.  inptr[i] holds the left 8
  // samples of row i, inptr[i + 16] the right 8.
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform.
  // Since all non-zero dct coefficients are in upper-left 4x4 area,
  // we only need to consider first 4 rows here.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);  // nonzero -> at least one lane out of range

  if (!test) {
    // Do the row transform (N.B. This transposes inptr).
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform.
    // N.B. Only first 4 cols contain non-zero coeffs.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 16; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the row
      // results into out[] for the C column transform.
      // Use fact only first 4 rows contain non-zero coeffs.
      array_transpose_8x8(inptr, inptr);
      array_transpose_8x8(inptr + 8, inptr + 16);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform (only the 4 nonzero rows).
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift and Reconstruction and Store.
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        // (x + 32) >> 6, then add to dest and clamp to [0, (1 << bd) - 1].
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}

View File

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// 32x32 high-bitdepth inverse DCT when only the DC coefficient is nonzero:
// the residual is a constant, so add the rounded DC value to every pixel and
// clamp to the valid pixel range for bit depth bd.
void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  __m128i dc_value, d;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // (1<<bd)-1
  int a, i, j;
  tran_low_t out;
  // The row and column transforms of a DC-only block each reduce to a
  // multiply by cospi_16_64 with rounding; apply both, then the final >> 6.
  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  a = ROUND_POWER_OF_TWO(out, 6);

  // Broadcast the DC residual to all 8 int16 lanes.
  d = _mm_set1_epi32(a);
  dc_value = _mm_packs_epi32(d, d);
  for (i = 0; i < 32; ++i) {
    // 32 pixels per row = 4 vectors of 8 uint16 pixels.
    for (j = 0; j < 4; ++j) {
      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
      d = _mm_adds_epi16(d, dc_value);
      // Clamp to [0, (1 << bd) - 1].
      d = _mm_max_epi16(d, zero);
      d = _mm_min_epi16(d, max);
      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
    }
    dest += stride;
  }
}

View File

@@ -0,0 +1,129 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// 4x4 high-bitdepth inverse DCT + reconstruction.  Uses a 16-bit SSE2 path
// when all intermediates fit in 16 bits; otherwise falls back to the 32-bit C
// transform per pass.  Each __m128i holds two 4-sample rows.
void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  __m128i inptr[4];
  __m128i sign_bits[2];
  __m128i temp_mm, min_input, max_input;
  int test;
  int optimised_cols = 0;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i eight = _mm_set1_epi16(8);  // 0.5 in Q4, for the >> 4 below
  // Coefficients beyond +/-12043 could overflow inside the 16-bit idct4_sse2.
  const __m128i max = _mm_set1_epi16(12043);
  const __m128i min = _mm_set1_epi16(-12043);

  // Load input into __m128i (four 32-bit coeffs per register).
  inptr[0] = _mm_loadu_si128((const __m128i *)input);
  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

  // Pack to 16 bits: inptr[0] = rows 0-1, inptr[1] = rows 2-3.
  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

  // Range-check the input for the row transform.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp_mm = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp_mm);  // nonzero -> some lane out of range

  if (!test) {
    // Do the row transform.
    idct4_sse2(inptr);

    // Check the min & max values for the column transform.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp_mm = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp_mm);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the
      // 16-bit row results into out[] for the C column transform.
      transpose_4x4(inptr);
      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform.
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct4_c(input, outptr, bd);
      input += 4;
      outptr += 4;
    }
  }

  if (optimised_cols) {
    idct4_sse2(inptr);

    // Final round and shift: (x + 8) >> 4.
    inptr[0] = _mm_add_epi16(inptr[0], eight);
    inptr[1] = _mm_add_epi16(inptr[1], eight);
    inptr[0] = _mm_srai_epi16(inptr[0], 4);
    inptr[1] = _mm_srai_epi16(inptr[1], 4);

    // Reconstruction and Store: gather pairs of 4-pixel dest rows, add the
    // residual, clamp, and scatter back.
    {
      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
      d0 = _mm_unpacklo_epi64(
          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
      d2 = _mm_unpacklo_epi64(
          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
      // store input0
      _mm_storel_epi64((__m128i *)dest, d0);
      // store input1
      d0 = _mm_srli_si128(d0, 8);
      _mm_storel_epi64((__m128i *)(dest + stride), d0);
      // store input2
      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
      // store input3
      d2 = _mm_srli_si128(d2, 8);
      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[4], temp_out[4];
    // Columns
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
      vpx_highbd_idct4_c(temp_in, temp_out, bd);
      for (j = 0; j < 4; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
      }
    }
  }
}

View File

@@ -0,0 +1,216 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// 8x8 high-bitdepth inverse DCT + reconstruction, full (64-coefficient)
// version.  Uses a 16-bit SSE2 path when intermediates fit; falls back to the
// 32-bit C transform for a pass whose inputs could overflow.
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);  // 0.5 in Q5, for the >> 5 below
  // Coefficients beyond +/-6201 could overflow inside the 16-bit idct8_sse2.
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits (one row per register).
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 8; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);  // nonzero -> at least one lane out of range

  if (!test) {
    // Do the row transform.
    idct8_sse2(inptr);

    // Find the min & max for the column transform.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the
      // 16-bit row results into out[] for the C column transform.
      array_transpose_8x8(inptr, inptr);
      for (i = 0; i < 8; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform.
    for (i = 0; i < 8; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift and Reconstruction and Store.
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        // (x + 16) >> 5, then add to dest and clamp to [0, (1 << bd) - 1].
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}
// 8x8 high-bitdepth inverse DCT + reconstruction when at most the first 12
// coefficients (top-left 4x4 corner) are nonzero.  Mirrors the 64-coeff
// version but only scans/propagates the nonzero region.
void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };  // zeroed: only a 4x4 corner gets written
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);  // 0.5 in Q5, for the >> 5 below
  // Coefficients beyond +/-6201 could overflow inside the 16-bit idct8_sse2.
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits (one row per register).
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform.
  // only first 4 row has non-zero coefs
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);  // nonzero -> at least one lane out of range

  if (!test) {
    // Do the row transform.
    idct8_sse2(inptr);

    // Find the min & max for the column transform.
    // N.B. Only first 4 cols contain non-zero coeffs.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the row
      // results into out[] for the C column transform.
      // Use fact only first 4 rows contain non-zero coeffs.
      array_transpose_4X8(inptr, inptr);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform (only the 4 nonzero rows).
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift and Reconstruction and Store.
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        // (x + 16) >> 5, then add to dest and clamp to [0, (1 << bd) - 1].
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}

View File

@@ -0,0 +1,33 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// Clamp each signed 16-bit lane of |value| to the valid pixel range
// [0, (1 << bd) - 1] for bit depth |bd|.
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  // min/max produce exactly the same result as the previous
  // cmpgt/andnot/and/or/and sequence (lanes above max saturate to max, then
  // lanes below zero saturate to zero) in two instructions instead of five.
  return _mm_min_epi16(_mm_max_epi16(value, zero), max);
}
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

View File

@@ -10,6 +10,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#define RECON_AND_STORE4X4(dest, in_x) \
@@ -170,14 +171,6 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}
// In-place transpose of a 4x4 block of 16-bit values.
// NOTE(review): assumes res[0]/res[1] each pack two 4-sample rows
// (rows 0-1 and rows 2-3) — confirm against idct4_sse2's layout.
static INLINE void transpose_4x4(__m128i *res) {
  // Interleave the two registers, then interleave the intermediate results
  // to complete the transpose.
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}
void idct4_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -3349,589 +3342,3 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
RECON_AND_STORE(dest + 24 + j * stride, dc_value);
}
}
#if CONFIG_VP9_HIGHBITDEPTH
// Clamp each signed 16-bit lane of |value| to the valid pixel range
// [0, (1 << bd) - 1] for bit depth |bd|.
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
  __m128i ubounded, retval;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  // Replace lanes greater than max with max...
  ubounded = _mm_cmpgt_epi16(value, max);
  retval = _mm_andnot_si128(ubounded, value);
  ubounded = _mm_and_si128(ubounded, max);
  retval = _mm_or_si128(retval, ubounded);
  // ...then zero any lane that is not strictly positive.
  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
  return retval;
}
// 4x4 high-bitdepth inverse DCT + reconstruction.  Uses a 16-bit SSE2 path
// when all intermediates fit in 16 bits; otherwise falls back to the 32-bit C
// transform per pass.  Each __m128i holds two 4-sample rows.
void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  __m128i inptr[4];
  __m128i sign_bits[2];
  __m128i temp_mm, min_input, max_input;
  int test;
  int optimised_cols = 0;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i eight = _mm_set1_epi16(8);  // 0.5 in Q4, for the >> 4 below
  // Coefficients beyond +/-12043 could overflow inside the 16-bit idct4_sse2.
  const __m128i max = _mm_set1_epi16(12043);
  const __m128i min = _mm_set1_epi16(-12043);

  // Load input into __m128i (four 32-bit coeffs per register).
  inptr[0] = _mm_loadu_si128((const __m128i *)input);
  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

  // Pack to 16 bits: inptr[0] = rows 0-1, inptr[1] = rows 2-3.
  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

  // Range-check the input for the row transform.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp_mm = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp_mm);  // nonzero -> some lane out of range

  if (!test) {
    // Do the row transform.
    idct4_sse2(inptr);

    // Check the min & max values for the column transform.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp_mm = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp_mm);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the
      // 16-bit row results into out[] for the C column transform.
      transpose_4x4(inptr);
      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform.
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct4_c(input, outptr, bd);
      input += 4;
      outptr += 4;
    }
  }

  if (optimised_cols) {
    idct4_sse2(inptr);

    // Final round and shift: (x + 8) >> 4.
    inptr[0] = _mm_add_epi16(inptr[0], eight);
    inptr[1] = _mm_add_epi16(inptr[1], eight);
    inptr[0] = _mm_srai_epi16(inptr[0], 4);
    inptr[1] = _mm_srai_epi16(inptr[1], 4);

    // Reconstruction and Store: gather pairs of 4-pixel dest rows, add the
    // residual, clamp, and scatter back.
    {
      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
      d0 = _mm_unpacklo_epi64(
          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
      d2 = _mm_unpacklo_epi64(
          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
      // store input0
      _mm_storel_epi64((__m128i *)dest, d0);
      // store input1
      d0 = _mm_srli_si128(d0, 8);
      _mm_storel_epi64((__m128i *)(dest + stride), d0);
      // store input2
      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
      // store input3
      d2 = _mm_srli_si128(d2, 8);
      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[4], temp_out[4];
    // Columns
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
      vpx_highbd_idct4_c(temp_in, temp_out, bd);
      for (j = 0; j < 4; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
      }
    }
  }
}
// 8x8 high-bitdepth inverse DCT + reconstruction, full (64-coefficient)
// version.  Uses a 16-bit SSE2 path when intermediates fit; falls back to the
// 32-bit C transform for a pass whose inputs could overflow.
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);  // 0.5 in Q5, for the >> 5 below
  // Coefficients beyond +/-6201 could overflow inside the 16-bit idct8_sse2.
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits (one row per register).
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 8; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);  // nonzero -> at least one lane out of range

  if (!test) {
    // Do the row transform.
    idct8_sse2(inptr);

    // Find the min & max for the column transform.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the
      // 16-bit row results into out[] for the C column transform.
      array_transpose_8x8(inptr, inptr);
      for (i = 0; i < 8; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform.
    for (i = 0; i < 8; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift and Reconstruction and Store.
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        // (x + 16) >> 5, then add to dest and clamp to [0, (1 << bd) - 1].
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}
// 8x8 high-bitdepth inverse DCT + reconstruction when at most the first 12
// coefficients (top-left 4x4 corner) are nonzero.  Mirrors the 64-coeff
// version but only scans/propagates the nonzero region.
void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };  // zeroed: only a 4x4 corner gets written
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);  // 0.5 in Q5, for the >> 5 below
  // Coefficients beyond +/-6201 could overflow inside the 16-bit idct8_sse2.
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits (one row per register).
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform.
  // only first 4 row has non-zero coefs
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);  // nonzero -> at least one lane out of range

  if (!test) {
    // Do the row transform.
    idct8_sse2(inptr);

    // Find the min & max for the column transform.
    // N.B. Only first 4 cols contain non-zero coeffs.
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Column pass could overflow: transpose back and sign-extend the row
      // results into out[] for the C column transform.
      // Use fact only first 4 rows contain non-zero coeffs.
      array_transpose_4X8(inptr, inptr);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column.
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform (only the 4 nonzero rows).
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift and Reconstruction and Store.
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        // (x + 16) >> 5, then add to dest and clamp to [0, (1 << bd) - 1].
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform.
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}
// Inverse 16x16 DCT (high bit depth), full 256-coefficient version.
// Runs in 16-bit SSE2 arithmetic when every coefficient is within the
// safe range, falling back to the 32-bit C transform otherwise; the
// result is rounded, shifted by 6 and added into the uint16_t dest.
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j, test;
  // Row i is split across two vectors of eight 16-bit lanes:
  // inptr[i] holds columns 0-7 and inptr[i + 16] holds columns 8-15.
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  // Coefficients outside [-3155, 3155] could overflow the 16-bit SSE2
  // transform, so the range is checked before each pass.  NOTE(review):
  // bound presumed derived from the worst-case idct16 gain -- confirm.
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;
  // Load input into __m128i & pack to 16 bits (saturating pack of the
  // 32-bit tran_low_t coefficients).
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }
  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 32; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  // Non-zero mask => some lane exceeded the safe 16-bit range.
  test = _mm_movemask_epi8(temp1);
  if (!test) {
    // Do the row transform
    idct16_sse2(inptr, inptr + 16);
    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 32; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);
    if (test) {
      // Column pass could overflow 16 bits: sign-extend the row-pass
      // output back to tran_low_t in out[] for the C column transform.
      array_transpose_16x16(inptr, inptr + 16);
      for (i = 0; i < 16; i++) {
        // Build the 32-bit sign extension from the 16-bit lane signs.
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 16; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }
  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);
    // Final round & shift and Reconstruction and Store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        // Round (add 32) then arithmetic shift right by 6.
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        // NOTE(review): non-saturating add -- presumably safe given the
        // earlier range checks on both passes; confirm worst-case sum.
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
// Inverse 16x16 DCT (high bit depth) for the eob <= 10 case: all
// non-zero coefficients lie in the top-left 4x4 area.  Runs in 16-bit
// SSE2 arithmetic when the coefficients are within the safe range,
// falling back to the 32-bit C transform otherwise.
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  // Zero-initialised because only the first 4 rows are written by the
  // row-pass fallbacks below, but the column pass reads all 16 rows.
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  // Row i is split across inptr[i] (cols 0-7) and inptr[i + 16] (cols 8-15).
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  // Coefficients outside [-3155, 3155] could overflow the 16-bit SSE2
  // transform, so the range is checked before each pass.  NOTE(review):
  // bound presumed derived from the worst-case idct16 gain -- confirm.
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;
  // Load input into __m128i & pack to 16 bits (saturating pack of the
  // 32-bit tran_low_t coefficients).
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }
  // Find the min & max for the row transform
  // Since all non-zero dct coefficients are in upper-left 4x4 area,
  // we only need to consider first 4 rows here.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  // Non-zero mask => some lane exceeded the safe 16-bit range.
  test = _mm_movemask_epi8(temp1);
  if (!test) {
    // Do the row transform (N.B. This transposes inptr)
    idct16_sse2(inptr, inptr + 16);
    // Find the min & max for the column transform
    // N.B. Only first 4 cols contain non-zero coeffs
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 16; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);
    if (test) {
      // Column pass could overflow 16 bits: sign-extend the row-pass
      // output back to tran_low_t in out[] for the C column transform.
      // Use fact only first 4 rows contain non-zero coeffs
      array_transpose_8x8(inptr, inptr);
      array_transpose_8x8(inptr + 8, inptr + 16);
      for (i = 0; i < 4; i++) {
        // Build the 32-bit sign extension from the 16-bit lane signs.
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform (only the 4 non-zero rows).
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }
  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);
    // Final round & shift and Reconstruction and Store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        // Round (add 32) then arithmetic shift right by 6.
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        // NOTE(review): non-saturating add -- presumably safe given the
        // earlier range checks on both passes; confirm worst-case sum.
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
// DC-only inverse 32x32 transform (high bit depth): the single DC
// coefficient is scaled by cospi_16_64 twice (once per 1-D pass),
// rounded down by 6 bits, then added to every pixel of the 32x32
// destination block with clamping to [0, 2^bd - 1].
void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  __m128i dc_vec, pix, tmp;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  // Largest legal pixel value for this bit depth: (1 << bd) - 1.
  const __m128i pix_max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  int dc_rounded, row, col;
  tran_low_t dc;
  dc = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  dc = HIGHBD_WRAPLOW(dct_const_round_shift(dc * cospi_16_64), bd);
  dc_rounded = ROUND_POWER_OF_TWO(dc, 6);
  // Broadcast the DC and narrow it to eight 16-bit lanes.
  tmp = _mm_set1_epi32(dc_rounded);
  dc_vec = _mm_packs_epi32(tmp, tmp);
  for (row = 0; row < 32; ++row) {
    // 32 pixels per row, processed 8 at a time.
    for (col = 0; col < 32; col += 8) {
      pix = _mm_loadu_si128((const __m128i *)(dest + col));
      pix = _mm_adds_epi16(pix, dc_vec);
      // Clamp into the valid pixel range.
      pix = _mm_min_epi16(_mm_max_epi16(pix, zero), pix_max);
      _mm_storeu_si128((__m128i *)(dest + col), pix);
    }
    dest += stride;
  }
}
#endif // CONFIG_VP9_HIGHBITDEPTH

View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_
#define VPX_DSP_X86_TRANSPOSE_SSE2_H_
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// Transpose a 4x4 block of 16-bit values held in two registers.
// Assumed layout (matches callers' packing -- TODO confirm): res[0]
// holds rows 0-1 and res[1] holds rows 2-3; on return, res[0] holds
// output columns 0-1 and res[1] holds output columns 2-3.
static INLINE void transpose_4x4(__m128i *res) {
  __m128i pass1_lo, pass1_hi;
  // First interleave: pairs the words of row 0 with row 2 (low halves)
  // and row 1 with row 3 (high halves).
  pass1_lo = _mm_unpacklo_epi16(res[0], res[1]);
  pass1_hi = _mm_unpackhi_epi16(res[0], res[1]);
  // Second interleave: gathers one full output column per 4-lane group.
  res[0] = _mm_unpacklo_epi16(pass1_lo, pass1_hi);
  res[1] = _mm_unpackhi_epi16(pass1_lo, pass1_hi);
}
#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_