vpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
Zoe Liu 7186a2dd86 Code refactor on InterpKernel
It in essence refactors the code for both the interpolation
filtering and the convolution. This change includes the moving
of all the files as well as the changing of the code from vp9_
prefix to vpx_ prefix accordingly, for underneath architectures:
(1) x86;
(2) arm/neon; and
(3) mips/msa.
The work on mips/drsp2 will be done in a separate change list.

Change-Id: Ic3ce7fb7f81210db7628b373c73553db68793c46
2015-07-31 10:27:33 -07:00

602 lines
26 KiB
C

/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// tmmintrin.h.
#include <tmmintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
// filters only for the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};
DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};
// filters for 8_h8 and 16_h8
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};
DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
// These are reused by the avx2 intrinsics.
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
uint32_t output_height,
const int16_t *filter) {
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
__m128i addFilterReg64, filtersReg, srcReg, minReg;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits in the filter into the first lane
firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
// duplicate only the third 16 bit in the filter into the first lane
secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
// duplicate only the seconds 16 bits in the filter into the second lane
// firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
// duplicate only the forth 16 bits in the filter into the second lane
// secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
// loading the local filters
shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
for (i = 0; i < output_height; i++) {
srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
// extract the higher half of the lane
srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
// add and saturate all the results together
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
// shift by 7 bit each 16 bits
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
// shrink to 8 bit each 16 bits
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
src_ptr+=src_pixels_per_line;
// save only 4 bytes
*((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
output_ptr+=output_pitch;
}
}
void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
uint32_t output_height,
const int16_t *filter) {
__m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
__m128i addFilterReg64, filtersReg, minReg;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits (first and second byte)
// across 128 bit register
firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
// duplicate only the second 16 bits (third and forth byte)
// across 128 bit register
secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
// duplicate only the third 16 bits (fifth and sixth byte)
// across 128 bit register
thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
// duplicate only the forth 16 bits (seventh and eighth byte)
// across 128 bit register
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
for (i = 0; i < output_height; i++) {
srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
// filter the source buffer
srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
// add and saturate all the results together
minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
// shift by 7 bit each 16 bits
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
// shrink to 8 bit each 16 bits
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
src_ptr+=src_pixels_per_line;
// save only 8 bytes
_mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
output_ptr+=output_pitch;
}
}
static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
uint32_t output_height,
const int16_t *filter) {
__m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits (first and second byte)
// across 128 bit register
firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
// duplicate only the second 16 bits (third and forth byte)
// across 128 bit register
secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
// duplicate only the third 16 bits (fifth and sixth byte)
// across 128 bit register
thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
// duplicate only the forth 16 bits (seventh and eighth byte)
// across 128 bit register
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
for (i = 0; i < output_height; i++) {
srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
// filter the source buffer
srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
_mm_min_epi16(srcRegFilt3, srcRegFilt2));
// reading the next 16 bytes.
// (part of it was being read by earlier read)
srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
_mm_max_epi16(srcRegFilt3, srcRegFilt2));
// filter the source buffer
srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
// add and saturate the results together
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
// filter the source buffer
srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
// add and saturate the results together
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
_mm_min_epi16(srcRegFilt3, srcRegFilt2));
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
_mm_max_epi16(srcRegFilt3, srcRegFilt2));
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
// shift by 7 bit each 16 bit
srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
// shrink to 8 bit each 16 bits, the first lane contain the first
// convolve result and the second lane contain the second convolve
// result
srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
src_ptr+=src_pixels_per_line;
// save 16 bytes
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
output_ptr+=output_pitch;
}
}
void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
uint32_t output_height,
const int16_t *filter) {
__m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
__m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
__m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits in the filter
firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
// duplicate only the second 16 bits in the filter
secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
// duplicate only the third 16 bits in the filter
thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
// duplicate only the forth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
// load the first 7 rows of 8 bytes
srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
for (i = 0; i < output_height; i++) {
// load the last 8 bytes
srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
// merge the result together
srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
// merge the result together
srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
// add and saturate the results together
minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
// shift by 7 bit each 16 bit
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
// shrink to 8 bit each 16 bits
srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
src_ptr+=src_pitch;
// shift down a row
srcReg1 = srcReg2;
srcReg2 = srcReg3;
srcReg3 = srcReg4;
srcReg4 = srcReg5;
srcReg5 = srcReg6;
srcReg6 = srcReg7;
srcReg7 = srcReg8;
// save only 8 bytes convolve result
_mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
output_ptr+=out_pitch;
}
}
static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
uint32_t output_height,
const int16_t *filter) {
__m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
__m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
__m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
// duplicate only the first 16 bits in the filter
firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
// duplicate only the second 16 bits in the filter
secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
// duplicate only the third 16 bits in the filter
thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
// duplicate only the forth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
// load the first 7 rows of 16 bytes
srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
for (i = 0; i < output_height; i++) {
// load the last 16 bytes
srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
// merge the result together
srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
// merge the result together
srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
// merge the result together
srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
_mm_min_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_min_epi16(srcRegFilt6, srcRegFilt8));
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
_mm_max_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_max_epi16(srcRegFilt6, srcRegFilt8));
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
// shift by 7 bit each 16 bit
srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
// shrink to 8 bit each 16 bits, the first lane contain the first
// convolve result and the second lane contain the second convolve
// result
srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
src_ptr+=src_pitch;
// shift down a row
srcReg1 = srcReg2;
srcReg2 = srcReg3;
srcReg3 = srcReg4;
srcReg4 = srcReg5;
srcReg5 = srcReg6;
srcReg6 = srcReg7;
srcReg7 = srcReg8;
// save 16 bytes convolve result
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
output_ptr+=out_pitch;
}
}
#if ARCH_X86_64
filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
#else // ARCH_X86
filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#endif // ARCH_X86_64
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
ssse3);
// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_ , ssse3);