Merge "Optimize convolve8 SSSE3 and AVX2 intrinsics"
This commit is contained in:
@@ -603,6 +603,75 @@ TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
|
|||||||
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
|
||||||
|
const uint8_t *const in = input();
|
||||||
|
uint8_t *const out = output();
|
||||||
|
const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
|
||||||
|
const int kNumTests = 5000000;
|
||||||
|
const int width = Width();
|
||||||
|
const int height = Height();
|
||||||
|
vpx_usec_timer timer;
|
||||||
|
|
||||||
|
SetConstantInput(127);
|
||||||
|
|
||||||
|
vpx_usec_timer_start(&timer);
|
||||||
|
for (int n = 0; n < kNumTests; ++n) {
|
||||||
|
UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
|
||||||
|
width, height);
|
||||||
|
}
|
||||||
|
vpx_usec_timer_mark(&timer);
|
||||||
|
|
||||||
|
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
|
||||||
|
printf("convolve8_%dx%d_%d: %d us\n", width, height,
|
||||||
|
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
|
||||||
|
const uint8_t *const in = input();
|
||||||
|
uint8_t *const out = output();
|
||||||
|
const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
|
||||||
|
const int kNumTests = 5000000;
|
||||||
|
const int width = Width();
|
||||||
|
const int height = Height();
|
||||||
|
vpx_usec_timer timer;
|
||||||
|
|
||||||
|
SetConstantInput(127);
|
||||||
|
|
||||||
|
vpx_usec_timer_start(&timer);
|
||||||
|
for (int n = 0; n < kNumTests; ++n) {
|
||||||
|
UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
|
||||||
|
width, height);
|
||||||
|
}
|
||||||
|
vpx_usec_timer_mark(&timer);
|
||||||
|
|
||||||
|
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
|
||||||
|
printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height,
|
||||||
|
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
|
||||||
|
const uint8_t *const in = input();
|
||||||
|
uint8_t *const out = output();
|
||||||
|
const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
|
||||||
|
const int kNumTests = 5000000;
|
||||||
|
const int width = Width();
|
||||||
|
const int height = Height();
|
||||||
|
vpx_usec_timer timer;
|
||||||
|
|
||||||
|
SetConstantInput(127);
|
||||||
|
|
||||||
|
vpx_usec_timer_start(&timer);
|
||||||
|
for (int n = 0; n < kNumTests; ++n) {
|
||||||
|
UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
|
||||||
|
width, height);
|
||||||
|
}
|
||||||
|
vpx_usec_timer_mark(&timer);
|
||||||
|
|
||||||
|
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
|
||||||
|
printf("convolve8_vert_%dx%d_%d: %d us\n", width, height,
|
||||||
|
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
|
TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
|
||||||
const uint8_t *const in = input();
|
const uint8_t *const in = input();
|
||||||
uint8_t *const out = output();
|
uint8_t *const out = output();
|
||||||
|
@@ -58,16 +58,19 @@ static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
|
|||||||
const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
|
const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
|
||||||
const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
|
const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
|
||||||
const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
|
const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
|
||||||
// add and saturate the results together
|
__m256i sum1, sum2;
|
||||||
const __m256i min_x2x1 = _mm256_min_epi16(x2, x1);
|
|
||||||
const __m256i max_x2x1 = _mm256_max_epi16(x2, x1);
|
// sum the results together, saturating only on the final step
|
||||||
__m256i temp = _mm256_adds_epi16(x0, x3);
|
// adding x0 with x2 and x1 with x3 is the only order that prevents
|
||||||
temp = _mm256_adds_epi16(temp, min_x2x1);
|
// outranges for all filters
|
||||||
temp = _mm256_adds_epi16(temp, max_x2x1);
|
sum1 = _mm256_add_epi16(x0, x2);
|
||||||
|
sum2 = _mm256_add_epi16(x1, x3);
|
||||||
|
// add the rounding offset early to avoid another saturated add
|
||||||
|
sum1 = _mm256_add_epi16(sum1, k_64);
|
||||||
|
sum1 = _mm256_adds_epi16(sum1, sum2);
|
||||||
// round and shift by 7 bit each 16 bit
|
// round and shift by 7 bit each 16 bit
|
||||||
temp = _mm256_adds_epi16(temp, k_64);
|
sum1 = _mm256_srai_epi16(sum1, 7);
|
||||||
temp = _mm256_srai_epi16(temp, 7);
|
return sum1;
|
||||||
return temp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
|
static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
|
||||||
@@ -82,16 +85,19 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
|
|||||||
_mm256_castsi256_si128(f[2]));
|
_mm256_castsi256_si128(f[2]));
|
||||||
const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
|
const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
|
||||||
_mm256_castsi256_si128(f[3]));
|
_mm256_castsi256_si128(f[3]));
|
||||||
// add and saturate the results together
|
__m128i sum1, sum2;
|
||||||
const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
|
|
||||||
const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
|
// sum the results together, saturating only on the final step
|
||||||
__m128i temp = _mm_adds_epi16(x0, x3);
|
// adding x0 with x2 and x1 with x3 is the only order that prevents
|
||||||
temp = _mm_adds_epi16(temp, min_x2x1);
|
// outranges for all filters
|
||||||
temp = _mm_adds_epi16(temp, max_x2x1);
|
sum1 = _mm_add_epi16(x0, x2);
|
||||||
// round and shift by 7 bit each 16 bit
|
sum2 = _mm_add_epi16(x1, x3);
|
||||||
temp = _mm_adds_epi16(temp, k_64);
|
// add the rounding offset early to avoid another saturated add
|
||||||
temp = _mm_srai_epi16(temp, 7);
|
sum1 = _mm_add_epi16(sum1, k_64);
|
||||||
return temp;
|
sum1 = _mm_adds_epi16(sum1, sum2);
|
||||||
|
// shift by 7 bit each 16 bit
|
||||||
|
sum1 = _mm_srai_epi16(sum1, 7);
|
||||||
|
return sum1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef MM256_BROADCASTSI128_SI256
|
#undef MM256_BROADCASTSI128_SI256
|
||||||
|
@@ -48,16 +48,19 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
|
|||||||
const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
|
const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
|
||||||
const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
|
const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
|
||||||
const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
|
const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
|
||||||
// add and saturate the results together
|
__m128i sum1, sum2;
|
||||||
const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
|
|
||||||
const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
|
// sum the results together, saturating only on the final step
|
||||||
__m128i temp = _mm_adds_epi16(x0, x3);
|
// adding x0 with x2 and x1 with x3 is the only order that prevents
|
||||||
temp = _mm_adds_epi16(temp, min_x2x1);
|
// outranges for all filters
|
||||||
temp = _mm_adds_epi16(temp, max_x2x1);
|
sum1 = _mm_add_epi16(x0, x2);
|
||||||
// round and shift by 7 bit each 16 bit
|
sum2 = _mm_add_epi16(x1, x3);
|
||||||
temp = _mm_adds_epi16(temp, k_64);
|
// add the rounding offset early to avoid another saturated add
|
||||||
temp = _mm_srai_epi16(temp, 7);
|
sum1 = _mm_add_epi16(sum1, k_64);
|
||||||
return temp;
|
sum1 = _mm_adds_epi16(sum1, sum2);
|
||||||
|
// shift by 7 bit each 16 bit
|
||||||
|
sum1 = _mm_srai_epi16(sum1, 7);
|
||||||
|
return sum1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
|
static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
|
||||||
|
@@ -38,8 +38,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
|
|||||||
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
|
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
|
||||||
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
|
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
|
||||||
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
|
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
|
||||||
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
|
__m128i srcRegFilt1, srcRegFilt2;
|
||||||
__m128i addFilterReg64, filtersReg, srcReg, minReg;
|
__m128i addFilterReg64, filtersReg, srcReg;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
|
||||||
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
|
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
|
||||||
@@ -75,18 +75,16 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
|
|||||||
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
|
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
|
||||||
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
|
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
|
||||||
|
|
||||||
// extract the higher half of the lane
|
// sum the results together, saturating only on the final step
|
||||||
srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
|
// the specific order of the additions prevents outranges
|
||||||
srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
|
srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
|
||||||
|
|
||||||
minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
|
// extract the higher half of the register
|
||||||
|
srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
|
||||||
|
|
||||||
// add and saturate all the results together
|
// add the rounding offset early to avoid another saturated add
|
||||||
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
|
srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
|
||||||
srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
|
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
|
||||||
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
|
|
||||||
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
|
|
||||||
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
|
|
||||||
|
|
||||||
// shift by 7 bit each 16 bits
|
// shift by 7 bit each 16 bits
|
||||||
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
|
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
|
||||||
|
Reference in New Issue
Block a user