c5443fc881
In 8-tap filtering, to guarantee that the intermediate results fit in 16 bits, the products must be accumulated in the correct order, with the largest product added last. This patch fixes the problem using the method from the commit "Correct ssse3 8/16-pixel wide sub-pixel filter calculation". Change-Id: I79d0ad60c057b15011ece84cda9648eee0809423
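
A minimal scalar sketch of that accumulation order, added here only for illustration (the helper names sat_add16 and accumulate_8tap are hypothetical and not part of libvpx): with 16-bit saturating additions, summing the four pairwise products so that the largest one comes last keeps every intermediate sum representable, which is what the min/max ordering in the intrinsics below achieves.

#include <stdint.h>

/* Saturating 16-bit add, the scalar analogue of _mm_adds_epi16. */
static int16_t sat_add16(int16_t a, int16_t b) {
  const int32_t s = (int32_t)a + (int32_t)b;  /* widen, then clamp */
  if (s > INT16_MAX) return INT16_MAX;
  if (s < INT16_MIN) return INT16_MIN;
  return (int16_t)s;
}

/* p1..p4 are the four pairwise products produced by _mm_maddubs_epi16:
 *   p1 = s[-3]*k0 + s[-2]*k1,  p2 = s[-1]*k2 + s[0]*k3,
 *   p3 = s[ 1]*k4 + s[ 2]*k5,  p4 = s[ 3]*k6 + s[ 4]*k7.
 * The dominant taps sit in the middle pairs, so the small outer sums are
 * accumulated first and the larger of the two middle sums is added last. */
static int16_t accumulate_8tap(int16_t p1, int16_t p2, int16_t p3, int16_t p4) {
  const int16_t lo = p2 < p3 ? p2 : p3;  /* _mm_min_epi16 */
  const int16_t hi = p2 < p3 ? p3 : p2;  /* _mm_max_epi16 */
  int16_t sum = sat_add16(p1, p4);       /* small outer products first */
  sum = sat_add16(sum, lo);
  return sat_add16(sum, hi);             /* largest product added last */
}
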
/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <tmmintrin.h>
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

// filters only for the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};

// filters for 8_h8 and 16_h8
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
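
// Note (added commentary): each filtN mask above drives _mm_shuffle_epi8 so
// that adjacent source pixels land side by side for _mm_maddubs_epi16. For
// example, filt1_global = {0, 1, 1, 2, 2, 3, ...} gathers the byte pairs
// (src[0], src[1]), (src[1], src[2]), ..., (src[7], src[8]); multiplying
// that against the broadcast (k0, k1) pair gives src[i]*k0 + src[i+1]*k1 for
// eight neighbouring output pixels in one instruction. filt2/filt3/filt4 do
// the same for the (k2, k3), (k4, k5) and (k6, k7) tap pairs at byte offsets
// 2, 4 and 6. The 4_h8 masks pack two tap pairs into one register because
// only four output pixels are produced per row.
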
void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // load the local shuffle masks
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
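
// Note (added commentary): in the 8- and 16-wide filters below,
// _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)) broadcasts filter
// bytes 1:0 (the packed k0/k1 pair) into every 16-bit lane; 0x302u, 0x504u
// and 0x706u broadcast the (k2, k3), (k4, k5) and (k6, k7) pairs in the
// same way.
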
void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
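
// Note (added commentary): the 16-wide horizontal filter below runs the same
// 8-wide computation twice per row, once on the load at src_ptr - 3 and once
// on the load at src_ptr + 5, then packs the two 8-pixel results into a
// single 16-byte store.
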
void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // read the next 16 bytes
    // (part of them was already read by the earlier load)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    src_ptr += src_pixels_per_line;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);

    output_ptr += output_pitch;
  }
}
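
// Note (added commentary): the vertical filters below interleave pairs of
// rows with _mm_unpacklo_epi8 (and _mm_unpackhi_epi8 in the 16-wide case) so
// that _mm_maddubs_epi16 applies one (kN, kN+1) tap pair across two
// neighbouring rows at a time; the four partial sums are then combined with
// the same saturation-safe min/max ordering as in the horizontal filters.
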
void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  for (i = 0; i < output_height; i++) {
    // load the first 8 bytes
    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
    // load the next 8 bytes in stride of src_pitch
    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);

    // merge the results together
    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);

    // load the next 8 bytes in stride of src_pitch
    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);

    // merge the results together
    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // save only 8 bytes of the convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
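
// Note (added commentary): the 16-wide vertical filter below applies the
// same row-interleaving scheme separately to the low and high 8-byte halves
// of each row (unpacklo/unpackhi) and packs the two 8-pixel results into a
// single 16-byte store.
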
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  for (i = 0; i < output_height; i++) {
    // load the first 16 bytes
    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
    // load the next 16 bytes in strides of one/six/seven src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));

    // merge the results together
    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);

    // load the next 16 bytes in strides of two/three src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));

    // merge the results together
    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);

    // load the next 16 bytes in strides of four/five src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));

    // merge the results together
    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);

    src_ptr += src_pitch;

    // save the 16 bytes of the convolve result
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);

    output_ptr += out_pitch;
  }
}