Merge "SSE2 version of vectorized 8-tap filtering." into experimental
This commit is contained in:
commit
4580f3371b
@ -14,8 +14,8 @@ prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const u
|
||||
# compiles warning free but a dissassembly of generated code show bugs. To be
|
||||
# on the safe side, only enabled when compiled with 'gcc'.
|
||||
if [ "$CONFIG_GCC" = "yes" ]; then
|
||||
specialize vp8_filter_block2d_4x4_8 sse4_1
|
||||
specialize vp8_filter_block2d_8x4_8 sse4_1
|
||||
specialize vp8_filter_block2d_8x8_8 sse4_1
|
||||
specialize vp8_filter_block2d_16x16_8 sse4_1
|
||||
specialize vp8_filter_block2d_4x4_8 sse4_1 sse2
|
||||
specialize vp8_filter_block2d_8x4_8 sse4_1 sse2
|
||||
specialize vp8_filter_block2d_8x8_8 sse4_1 sse2
|
||||
specialize vp8_filter_block2d_16x16_8 sse4_1 sse2
|
||||
fi
|
||||
|
289
vp8/common/x86/filter_sse2.c
Normal file
289
vp8/common/x86/filter_sse2.c
Normal file
@ -0,0 +1,289 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <assert.h> // for alignment checks
|
||||
#include <emmintrin.h> // SSE2
|
||||
#include "vp8/common/filter.h"
|
||||
#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
|
||||
#include "vpx_rtcd.h"
|
||||
|
||||
// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
|
||||
// just a quick partial snapshot so that other can already use some
|
||||
// speedup.
|
||||
// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
|
||||
// filtering.
|
||||
// TODO(cd): Add some comments, better variable naming.
|
||||
// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
|
||||
// of positive above 128), or have higher precision filter
|
||||
// coefficients.
|
||||
|
||||
DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
|
||||
VP8_FILTER_WEIGHT >> 1,
|
||||
VP8_FILTER_WEIGHT >> 1,
|
||||
VP8_FILTER_WEIGHT >> 1,
|
||||
VP8_FILTER_WEIGHT >> 1,
|
||||
};
|
||||
|
||||
// Creating a macro to do more than four pixels at once to hide instruction
|
||||
// latency is actually slower :-(
|
||||
#define DO_FOUR_PIXELS(result, src_ptr, offset) \
|
||||
{ \
|
||||
/* Do shifted load to achieve require shuffles through unpacking */ \
|
||||
const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
|
||||
const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
|
||||
const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
|
||||
const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
|
||||
const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
|
||||
const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
|
||||
const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
|
||||
const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
|
||||
/* Shit by 4 bytes through suffle to get additional shifted loads */ \
|
||||
const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
|
||||
const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
|
||||
const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
|
||||
const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
|
||||
const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
|
||||
const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
|
||||
const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
|
||||
const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
|
||||
/* multiply accumulate them */ \
|
||||
const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
|
||||
const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
|
||||
const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
|
||||
const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
|
||||
const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
|
||||
const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
|
||||
__m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
|
||||
mad_all = _mm_add_epi32(mad_all, rounding); \
|
||||
result = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); \
|
||||
}
|
||||
|
||||
void vp8_filter_block2d_4x4_8_sse2
|
||||
(
|
||||
const unsigned char *src_ptr, const unsigned int src_stride,
|
||||
const short *HFilter_aligned16, const short *VFilter_aligned16,
|
||||
unsigned char *dst_ptr, unsigned int dst_stride
|
||||
) {
|
||||
__m128i intermediateA, intermediateB, intermediateC;
|
||||
|
||||
const int kInterp_Extend = 4;
|
||||
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
|
||||
|
||||
// check alignment
|
||||
assert(0 == ((long)HFilter_aligned16)%16);
|
||||
assert(0 == ((long)VFilter_aligned16)%16);
|
||||
|
||||
{
|
||||
__m128i transpose3_0;
|
||||
__m128i transpose3_1;
|
||||
__m128i transpose3_2;
|
||||
__m128i transpose3_3;
|
||||
|
||||
// Horizontal pass (src -> intermediate).
|
||||
{
|
||||
const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
|
||||
// get first two columns filter coefficients
|
||||
__m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
__m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
|
||||
|
||||
{
|
||||
__m128i mad_all0;
|
||||
__m128i mad_all1;
|
||||
__m128i mad_all2;
|
||||
__m128i mad_all3;
|
||||
DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
|
||||
mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
|
||||
mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
|
||||
intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
|
||||
// --
|
||||
src_ptr += src_stride*4;
|
||||
// --
|
||||
DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
|
||||
mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
|
||||
mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
|
||||
intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
|
||||
// --
|
||||
src_ptr += src_stride*4;
|
||||
// --
|
||||
DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
|
||||
DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
|
||||
mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
|
||||
mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
|
||||
intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
|
||||
}
|
||||
}
|
||||
|
||||
// Transpose result (intermediate -> transpose3_x)
|
||||
{
|
||||
// 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
|
||||
// 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
|
||||
// 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
|
||||
const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
|
||||
const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
|
||||
const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
|
||||
const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
|
||||
// 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
|
||||
// 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
|
||||
// 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
|
||||
// A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
|
||||
const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
|
||||
const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
|
||||
const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
|
||||
const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
|
||||
// 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
|
||||
// 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
|
||||
// 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
|
||||
// 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
|
||||
const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
|
||||
const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
|
||||
const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
|
||||
const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
|
||||
// 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
|
||||
// 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
|
||||
// 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
|
||||
// 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
|
||||
transpose3_0 = _mm_castps_si128(
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
|
||||
_mm_castsi128_ps(transpose2_2),
|
||||
_MM_SHUFFLE(1, 0, 1, 0)));
|
||||
transpose3_1 = _mm_castps_si128(
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
|
||||
_mm_castsi128_ps(transpose2_2),
|
||||
_MM_SHUFFLE(3, 2, 3, 2)));
|
||||
transpose3_2 = _mm_castps_si128(
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
|
||||
_mm_castsi128_ps(transpose2_3),
|
||||
_MM_SHUFFLE(1, 0, 1, 0)));
|
||||
transpose3_3 = _mm_castps_si128(
|
||||
_mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
|
||||
_mm_castsi128_ps(transpose2_3),
|
||||
_MM_SHUFFLE(3, 2, 3, 2)));
|
||||
// 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
|
||||
// 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
|
||||
// 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
|
||||
// 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
|
||||
}
|
||||
|
||||
// Vertical pass (transpose3_x -> dst).
|
||||
{
|
||||
const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
|
||||
// get first two columns filter coefficients
|
||||
__m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
__m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
__m128i col0, col1, col2, col3;
|
||||
DECLARE_ALIGNED(16, unsigned char, temp[32]);
|
||||
{
|
||||
_mm_store_si128((__m128i *)temp, transpose3_0);
|
||||
DO_FOUR_PIXELS(col0, temp, 0);
|
||||
}
|
||||
{
|
||||
_mm_store_si128((__m128i *)temp, transpose3_1);
|
||||
DO_FOUR_PIXELS(col1, temp, 0);
|
||||
}
|
||||
{
|
||||
_mm_store_si128((__m128i *)temp, transpose3_2);
|
||||
DO_FOUR_PIXELS(col2, temp, 0);
|
||||
}
|
||||
{
|
||||
_mm_store_si128((__m128i *)temp, transpose3_3);
|
||||
DO_FOUR_PIXELS(col3, temp, 0);
|
||||
}
|
||||
// transpose
|
||||
{
|
||||
__m128i T0 = _mm_unpacklo_epi32(col0, col1);
|
||||
__m128i T1 = _mm_unpacklo_epi32(col2, col3);
|
||||
__m128i T2 = _mm_unpackhi_epi32(col0, col1);
|
||||
__m128i T3 = _mm_unpackhi_epi32(col2, col3);
|
||||
col0 = _mm_unpacklo_epi64(T0, T1);
|
||||
col1 = _mm_unpackhi_epi64(T0, T1);
|
||||
col2 = _mm_unpacklo_epi64(T2, T3);
|
||||
col3 = _mm_unpackhi_epi64(T2, T3);
|
||||
}
|
||||
// saturate to 8 bit
|
||||
{
|
||||
col0 = _mm_packs_epi32(col0, col0);
|
||||
col0 = _mm_packus_epi16(col0, col0);
|
||||
col1 = _mm_packs_epi32(col1, col1);
|
||||
col1 = _mm_packus_epi16(col1, col1);
|
||||
col2 = _mm_packs_epi32 (col2, col2);
|
||||
col2 = _mm_packus_epi16(col2, col2);
|
||||
col3 = _mm_packs_epi32 (col3, col3);
|
||||
col3 = _mm_packus_epi16(col3, col3);
|
||||
}
|
||||
// store
|
||||
{
|
||||
*((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
|
||||
*((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
|
||||
*((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
|
||||
*((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_filter_block2d_8x4_8_sse2
|
||||
(
|
||||
const unsigned char *src_ptr, const unsigned int src_stride,
|
||||
const short *HFilter_aligned16, const short *VFilter_aligned16,
|
||||
unsigned char *dst_ptr, unsigned int dst_stride
|
||||
) {
|
||||
int j;
|
||||
for (j=0; j<8; j+=4) {
|
||||
vp8_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
|
||||
HFilter_aligned16, VFilter_aligned16,
|
||||
dst_ptr + j, dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_filter_block2d_8x8_8_sse2
|
||||
(
|
||||
const unsigned char *src_ptr, const unsigned int src_stride,
|
||||
const short *HFilter_aligned16, const short *VFilter_aligned16,
|
||||
unsigned char *dst_ptr, unsigned int dst_stride
|
||||
) {
|
||||
int i, j;
|
||||
for (i=0; i<8; i+=4) {
|
||||
for (j=0; j<8; j+=4) {
|
||||
vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
|
||||
HFilter_aligned16, VFilter_aligned16,
|
||||
dst_ptr + j + i*dst_stride, dst_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_filter_block2d_16x16_8_sse2
|
||||
(
|
||||
const unsigned char *src_ptr, const unsigned int src_stride,
|
||||
const short *HFilter_aligned16, const short *VFilter_aligned16,
|
||||
unsigned char *dst_ptr, unsigned int dst_stride
|
||||
) {
|
||||
int i, j;
|
||||
for (i=0; i<16; i+=4) {
|
||||
for (j=0; j<16; j+=4) {
|
||||
vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
|
||||
HFilter_aligned16, VFilter_aligned16,
|
||||
dst_ptr + j + i*dst_stride, dst_stride);
|
||||
}
|
||||
}
|
||||
}
|
@ -25,9 +25,6 @@
|
||||
// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
|
||||
// of positive above 128), or have higher precision filter
|
||||
// coefficients.
|
||||
// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not
|
||||
// require SSE4.1
|
||||
// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3
|
||||
|
||||
DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
|
||||
0x00, 0x01,
|
||||
|
@ -119,6 +119,11 @@ ifeq ($(HAVE_SSE4_1),yes)
|
||||
vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4
|
||||
endif
|
||||
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
|
||||
ifeq ($(HAVE_SSE2),yes)
|
||||
vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
|
||||
endif
|
||||
|
||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
|
||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
|
||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h
|
||||
|
Loading…
x
Reference in New Issue
Block a user