In 8-tap filtering, the products must be accumulated in the right order to guarantee that the intermediate results fit in 16 bits: the largest product should be added last. This patch fixes the problem using the method from commit "Correct ssse3 8/16-pixel wide sub-pixel filter calculation". Change-Id: I79d0ad60c057b15011ece84cda9648eee0809423
493 lines
21 KiB
493 lines
21 KiB
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <tmmintrin.h>
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
// filters only for the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
// filters for 8_h8 and 16_h8
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
// Horizontal 8-tap filter producing 4 output pixels per row (SSSE3).
//
// For each row and x = 0..3 the result is
//   (sum_{j=0..7} filter[j] * src_ptr[x - 3 + j] + 64) >> 7
// computed with saturating 16-bit arithmetic and then packed to unsigned
// 8 bits with saturation.
//
//   src_ptr             - first pixel of the first row to filter; 16 bytes
//                         starting at src_ptr - 3 are loaded for every row.
//   src_pixels_per_line - stride, in bytes, between source rows.
//   output_ptr          - destination; 4 bytes are written per row.
//   output_pitch        - stride, in bytes, between destination rows.
//   output_height       - number of rows to process.
//   filter              - 8 taps stored as 16-bit values; they are narrowed
//                         to signed 8 bits (with saturation) before use.
void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding term added before the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) taps to 8 bit (byte) and have the same data
  // in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the low half
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the low half
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the high half
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the high half
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // load the shuffle masks that are specific to the 4-wide filter
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // gather the source byte pairs that line up with each tap pair
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // bring the high halves (k2/k3 and k6/k7 partial sums) down to the low
    // half so they line up with the k0/k1 and k4/k5 partial sums
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    // Accumulate with saturation. The outer taps (k0/k1 + k6/k7) go first,
    // then the smaller of the two middle partial sums, and the larger one
    // last; this order is intended to keep intermediate sums within 16 bits.
    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    // save only 4 bytes
    *((int*)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    // advance to the next source and destination rows
    src_ptr += src_pixels_per_line;
    output_ptr += output_pitch;
  }
}
// Horizontal 8-tap filter producing 8 output pixels per row (SSSE3).
//
// For each row and x = 0..7 the result is
//   (sum_{j=0..7} filter[j] * src_ptr[x - 3 + j] + 64) >> 7
// computed with saturating 16-bit arithmetic and then packed to unsigned
// 8 bits with saturation.
//
//   src_ptr             - first pixel of the first row to filter; 16 bytes
//                         starting at src_ptr - 3 are loaded for every row.
//   src_pixels_per_line - stride, in bytes, between source rows.
//   output_ptr          - destination; 8 bytes are written per row.
//   output_pitch        - stride, in bytes, between destination rows.
//   output_height       - number of rows to process.
//   filter              - 8 taps stored as 16-bit values; they are narrowed
//                         to signed 8 bits (with saturation) before use.
void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding term added before the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) taps to 8 bit (byte) and have the same data
  // in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // gather the source byte pairs for taps 0/1 and 2/3
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // gather the source byte pairs for taps 4/5 and 6/7
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // Accumulate with saturation. The outer taps (0/1 + 6/7) go first,
    // then the smaller of the two middle partial sums, and the larger one
    // last; this order is intended to keep intermediate sums within 16 bits.
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    // save only 8 bytes
    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);

    // advance to the next source and destination rows
    src_ptr += src_pixels_per_line;
    output_ptr += output_pitch;
  }
}
// Horizontal 8-tap filter producing 16 output pixels per row (SSSE3).
//
// For each row and x = 0..15 the result is
//   (sum_{j=0..7} filter[j] * src_ptr[x - 3 + j] + 64) >> 7
// computed with saturating 16-bit arithmetic and then packed to unsigned
// 8 bits with saturation. Each row is processed as two 8-pixel halves.
//
//   src_ptr             - first pixel of the first row to filter; two
//                         16-byte loads are issued per row, at src_ptr - 3
//                         and at src_ptr + 5.
//   src_pixels_per_line - stride, in bytes, between source rows.
//   output_ptr          - destination; 16 bytes are written per row with an
//                         aligned store, so every destination row must be
//                         16-byte aligned.
//   output_pitch        - stride, in bytes, between destination rows.
//   output_height       - number of rows to process.
//   filter              - 8 taps stored as 16-bit values; they are narrowed
//                         to signed 8 bits (with saturation) before use.
void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding term added before the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) taps to 8 bit (byte) and have the same data
  // in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    // ---- first 8 output pixels ----
    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // gather the source byte pairs for the outer taps (0/1 and 6/7)
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the outer partial sums together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // gather the source byte pairs for the middle taps (2/3 and 4/5)
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add the smaller middle partial sum first ...
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes.
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // ... and the larger one last; this order is intended to keep
    // intermediate sums within 16 bits
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // ---- second 8 output pixels ----
    // gather the source byte pairs for the outer taps (0/1 and 6/7)
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the outer partial sums together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // gather the source byte pairs for the middle taps (2/3 and 4/5)
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // smaller middle partial sum first, larger one last
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // add the rounding term to both halves
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the low half contains the first
    // convolve result and the high half contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);

    // advance to the next source and destination rows
    src_ptr += src_pixels_per_line;
    output_ptr += output_pitch;
  }
}
// Vertical 8-tap filter producing 8 output pixels per row (SSSE3).
//
// For each output row and x = 0..7 the result is
//   (sum_{j=0..7} filter[j] * src_ptr[x + j * src_pitch] + 64) >> 7
// computed with saturating 16-bit arithmetic and then packed to unsigned
// 8 bits with saturation.
//
//   src_ptr       - first of the 8 source rows that contribute to the first
//                   output row; 8 bytes are loaded from each of the rows.
//   src_pitch     - stride, in bytes, between source rows.
//   output_ptr    - destination; 8 bytes are written per row.
//   out_pitch     - stride, in bytes, between destination rows.
//   output_height - number of output rows to produce.
//   filter        - 8 taps stored as 16-bit values; they are narrowed to
//                   signed 8 bits (with saturation) before use.
void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding term added before the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) taps to 8 bit (byte) and have the same data
  // in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  for (i = 0; i < output_height; i++) {
    // load the first 8 bytes
    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
    // load the next 8 bytes in stride of src_pitch
    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);

    // interleave rows 0/1 and rows 2/3 byte by byte so that vertically
    // adjacent pixels sit next to each other for _mm_maddubs_epi16
    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);

    // load the next 8 bytes in stride of src_pitch
    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);

    // interleave rows 4/5 and rows 6/7
    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // Accumulate with saturation. The outer taps (0/1 + 6/7) go first,
    // then the smaller of the two middle partial sums, and the larger one
    // last; this order is intended to keep intermediate sums within 16 bits.
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);

    // slide the 8-row window down by one row and move to the next
    // destination row
    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
// Vertical 8-tap filter producing 16 output pixels per row (SSSE3).
//
// For each output row and x = 0..15 the result is
//   (sum_{j=0..7} filter[j] * src_ptr[x + j * src_pitch] + 64) >> 7
// computed with saturating 16-bit arithmetic and then packed to unsigned
// 8 bits with saturation. The low and high 8 pixels are accumulated in
// separate registers and merged at the end.
//
//   src_ptr       - first of the 8 source rows that contribute to the first
//                   output row; 16 bytes are loaded from each of the rows.
//   src_pitch     - stride, in bytes, between source rows.
//   output_ptr    - destination; 16 bytes are written per row with an
//                   aligned store, so every destination row must be
//                   16-byte aligned.
//   out_pitch     - stride, in bytes, between destination rows.
//   output_height - number of output rows to produce.
//   filter        - 8 taps stored as 16-bit values; they are narrowed to
//                   signed 8 bits (with saturation) before use.
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding term added before the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) taps to 8 bit (byte) and have the same data
  // in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  for (i = 0; i < output_height; i++) {
    // load the first 16 bytes
    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
    // load rows 1, 6 and 7 (the outer taps are handled first)
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));

    // interleave rows 0/1 and rows 6/7; "lo" covers pixels 0..7 and
    // "hi" covers pixels 8..15
    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

    // add and saturate the outer partial sums together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);

    // load the next 16 bytes in stride of two/three src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));

    // interleave rows 2/3
    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);

    // load the next 16 bytes in stride of four/five src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));

    // interleave rows 4/5
    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);

    // add the smaller of the two middle partial sums first ...
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));

    // ... and the larger one last; this order is intended to keep
    // intermediate sums within 16 bits
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));

    // add the rounding term to both halves
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits, the low half contains the first
    // convolve result and the high half contains the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);

    // save 16 bytes convolve result
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

    // slide the 8-row window down by one row and move to the next
    // destination row
    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}