Split dsp/x86/inv_txfm_sse2.c
Spin out highbd idct functions. BUG=webm:1412 Change-Id: I0cfe4117c00039b6778c59c022eee79ad089a2af
This commit is contained in:
@@ -231,6 +231,11 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
|
||||
endif # !CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
ifeq ($(HAVE_NEON_ASM),yes)
|
||||
@@ -351,6 +356,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
|
||||
DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
|
||||
DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
|
||||
|
||||
# X86 utilities
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
|
||||
|
||||
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
|
||||
|
||||
DSP_SRCS-yes += vpx_dsp_rtcd.c
|
||||
|
||||
244
vpx_dsp/x86/highbd_idct16x16_add_sse2.c
Normal file
244
vpx_dsp/x86/highbd_idct16x16_add_sse2.c
Normal file
@@ -0,0 +1,244 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_16x16(inptr, inptr + 16);
|
||||
for (i = 0; i < 16; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 16; ++i) {
|
||||
vpx_highbd_idct16_c(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], rounding);
|
||||
inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 6);
|
||||
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
|
||||
vpx_highbd_idct16_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// Since all non-zero dct coefficients are in upper-left 4x4 area,
|
||||
// we only need to consider first 4 rows here.
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform (N.B. This transposes inptr)
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 16; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
array_transpose_8x8(inptr + 8, inptr + 16);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vpx_highbd_idct16_c(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], rounding);
|
||||
inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 6);
|
||||
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
|
||||
vpx_highbd_idct16_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
41
vpx_dsp/x86/highbd_idct32x32_add_sse2.c
Normal file
41
vpx_dsp/x86/highbd_idct32x32_add_sse2.c
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
__m128i dc_value, d;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
|
||||
int a, i, j;
|
||||
tran_low_t out;
|
||||
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
|
||||
a = ROUND_POWER_OF_TWO(out, 6);
|
||||
|
||||
d = _mm_set1_epi32(a);
|
||||
dc_value = _mm_packs_epi32(d, d);
|
||||
for (i = 0; i < 32; ++i) {
|
||||
for (j = 0; j < 4; ++j) {
|
||||
d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
|
||||
d = _mm_adds_epi16(d, dc_value);
|
||||
d = _mm_max_epi16(d, zero);
|
||||
d = _mm_min_epi16(d, max);
|
||||
_mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
|
||||
}
|
||||
dest += stride;
|
||||
}
|
||||
}
|
||||
129
vpx_dsp/x86/highbd_idct4x4_add_sse2.c
Normal file
129
vpx_dsp/x86/highbd_idct4x4_add_sse2.c
Normal file
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[4 * 4];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j;
|
||||
__m128i inptr[4];
|
||||
__m128i sign_bits[2];
|
||||
__m128i temp_mm, min_input, max_input;
|
||||
int test;
|
||||
int optimised_cols = 0;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i eight = _mm_set1_epi16(8);
|
||||
const __m128i max = _mm_set1_epi16(12043);
|
||||
const __m128i min = _mm_set1_epi16(-12043);
|
||||
// Load input into __m128i
|
||||
inptr[0] = _mm_loadu_si128((const __m128i *)input);
|
||||
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
|
||||
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
|
||||
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
|
||||
|
||||
// Pack to 16 bits
|
||||
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
|
||||
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
|
||||
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Check the min & max values
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (test) {
|
||||
transpose_4x4(inptr);
|
||||
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
|
||||
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
|
||||
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
|
||||
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
|
||||
_mm_storeu_si128((__m128i *)outptr, inptr[0]);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vpx_highbd_idct4_c(input, outptr, bd);
|
||||
input += 4;
|
||||
outptr += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Final round and shift
|
||||
inptr[0] = _mm_add_epi16(inptr[0], eight);
|
||||
inptr[1] = _mm_add_epi16(inptr[1], eight);
|
||||
|
||||
inptr[0] = _mm_srai_epi16(inptr[0], 4);
|
||||
inptr[1] = _mm_srai_epi16(inptr[1], 4);
|
||||
|
||||
// Reconstruction and Store
|
||||
{
|
||||
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
|
||||
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
|
||||
d0 = _mm_unpacklo_epi64(
|
||||
d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
|
||||
d2 = _mm_unpacklo_epi64(
|
||||
d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
|
||||
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
|
||||
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
|
||||
// store input0
|
||||
_mm_storel_epi64((__m128i *)dest, d0);
|
||||
// store input1
|
||||
d0 = _mm_srli_si128(d0, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride), d0);
|
||||
// store input2
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
|
||||
// store input3
|
||||
d2 = _mm_srli_si128(d2, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[4], temp_out[4];
|
||||
// Columns
|
||||
for (i = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
|
||||
vpx_highbd_idct4_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 4; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
216
vpx_dsp/x86/highbd_idct8x8_add_sse2.c
Normal file
216
vpx_dsp/x86/highbd_idct8x8_add_sse2.c
Normal file
@@ -0,0 +1,216 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
for (i = 0; i < 8; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 8; ++i) {
|
||||
vpx_highbd_idct8_c(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
|
||||
vpx_highbd_idct8_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// only first 4 row has non-zero coefs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_4X8(inptr, inptr);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vpx_highbd_idct8_c(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
|
||||
vpx_highbd_idct8_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
33
vpx_dsp/x86/highbd_inv_txfm_sse2.h
Normal file
33
vpx_dsp/x86/highbd_inv_txfm_sse2.h
Normal file
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
|
||||
#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
|
||||
|
||||
#include <emmintrin.h> // SSE2
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
#include "vpx_dsp/inv_txfm.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
|
||||
__m128i ubounded, retval;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
|
||||
ubounded = _mm_cmpgt_epi16(value, max);
|
||||
retval = _mm_andnot_si128(ubounded, value);
|
||||
ubounded = _mm_and_si128(ubounded, max);
|
||||
retval = _mm_or_si128(retval, ubounded);
|
||||
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
|
||||
return retval;
|
||||
}
|
||||
|
||||
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/transpose_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
#define RECON_AND_STORE4X4(dest, in_x) \
|
||||
@@ -170,14 +171,6 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
|
||||
}
|
||||
|
||||
static INLINE void transpose_4x4(__m128i *res) {
|
||||
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
|
||||
const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
|
||||
|
||||
res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
|
||||
res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
|
||||
}
|
||||
|
||||
void idct4_sse2(__m128i *in) {
|
||||
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
|
||||
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
|
||||
@@ -3349,589 +3342,3 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
RECON_AND_STORE(dest + 24 + j * stride, dc_value);
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
|
||||
__m128i ubounded, retval;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
|
||||
ubounded = _mm_cmpgt_epi16(value, max);
|
||||
retval = _mm_andnot_si128(ubounded, value);
|
||||
ubounded = _mm_and_si128(ubounded, max);
|
||||
retval = _mm_or_si128(retval, ubounded);
|
||||
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
|
||||
return retval;
|
||||
}
|
||||
|
||||
void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[4 * 4];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j;
|
||||
__m128i inptr[4];
|
||||
__m128i sign_bits[2];
|
||||
__m128i temp_mm, min_input, max_input;
|
||||
int test;
|
||||
int optimised_cols = 0;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i eight = _mm_set1_epi16(8);
|
||||
const __m128i max = _mm_set1_epi16(12043);
|
||||
const __m128i min = _mm_set1_epi16(-12043);
|
||||
// Load input into __m128i
|
||||
inptr[0] = _mm_loadu_si128((const __m128i *)input);
|
||||
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
|
||||
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
|
||||
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
|
||||
|
||||
// Pack to 16 bits
|
||||
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
|
||||
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
|
||||
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Check the min & max values
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (test) {
|
||||
transpose_4x4(inptr);
|
||||
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
|
||||
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
|
||||
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
|
||||
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
|
||||
_mm_storeu_si128((__m128i *)outptr, inptr[0]);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vpx_highbd_idct4_c(input, outptr, bd);
|
||||
input += 4;
|
||||
outptr += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Final round and shift
|
||||
inptr[0] = _mm_add_epi16(inptr[0], eight);
|
||||
inptr[1] = _mm_add_epi16(inptr[1], eight);
|
||||
|
||||
inptr[0] = _mm_srai_epi16(inptr[0], 4);
|
||||
inptr[1] = _mm_srai_epi16(inptr[1], 4);
|
||||
|
||||
// Reconstruction and Store
|
||||
{
|
||||
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
|
||||
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
|
||||
d0 = _mm_unpacklo_epi64(
|
||||
d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
|
||||
d2 = _mm_unpacklo_epi64(
|
||||
d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
|
||||
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
|
||||
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
|
||||
// store input0
|
||||
_mm_storel_epi64((__m128i *)dest, d0);
|
||||
// store input1
|
||||
d0 = _mm_srli_si128(d0, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride), d0);
|
||||
// store input2
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
|
||||
// store input3
|
||||
d2 = _mm_srli_si128(d2, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[4], temp_out[4];
|
||||
// Columns
|
||||
for (i = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
|
||||
vpx_highbd_idct4_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 4; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
for (i = 0; i < 8; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 8; ++i) {
|
||||
vpx_highbd_idct8_c(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
|
||||
vpx_highbd_idct8_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// only first 4 row has non-zero coefs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_4X8(inptr, inptr);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vpx_highbd_idct8_c(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
|
||||
vpx_highbd_idct8_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_16x16(inptr, inptr + 16);
|
||||
for (i = 0; i < 16; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 16; ++i) {
|
||||
vpx_highbd_idct16_c(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], rounding);
|
||||
inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 6);
|
||||
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
|
||||
vpx_highbd_idct16_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// Since all non-zero dct coefficients are in upper-left 4x4 area,
|
||||
// we only need to consider first 4 rows here.
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform (N.B. This transposes inptr)
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 16; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
array_transpose_8x8(inptr + 8, inptr + 16);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
|
||||
_mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vpx_highbd_idct16_c(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], rounding);
|
||||
inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 6);
|
||||
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
|
||||
vpx_highbd_idct16_c(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
|
||||
int stride, int bd) {
|
||||
__m128i dc_value, d;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
|
||||
int a, i, j;
|
||||
tran_low_t out;
|
||||
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
|
||||
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
|
||||
a = ROUND_POWER_OF_TWO(out, 6);
|
||||
|
||||
d = _mm_set1_epi32(a);
|
||||
dc_value = _mm_packs_epi32(d, d);
|
||||
for (i = 0; i < 32; ++i) {
|
||||
for (j = 0; j < 4; ++j) {
|
||||
d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
|
||||
d = _mm_adds_epi16(d, dc_value);
|
||||
d = _mm_max_epi16(d, zero);
|
||||
d = _mm_min_epi16(d, max);
|
||||
_mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
|
||||
}
|
||||
dest += stride;
|
||||
}
|
||||
}
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
26
vpx_dsp/x86/transpose_sse2.h
Normal file
26
vpx_dsp/x86/transpose_sse2.h
Normal file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_
|
||||
#define VPX_DSP_X86_TRANSPOSE_SSE2_H_
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "vpx_dsp/x86/txfm_common_sse2.h"
|
||||
|
||||
static INLINE void transpose_4x4(__m128i *res) {
|
||||
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
|
||||
const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
|
||||
|
||||
res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
|
||||
res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
|
||||
}
|
||||
|
||||
#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_
|
||||
Reference in New Issue
Block a user