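Make the horiz4 interface consistent with horizP: filter_horiz_w4_ssse3 now takes the preloaded filter coefficient vectors and the tap count instead of a struct Filter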

This commit is contained in:
Yi Luo 2016-06-06 17:38:05 -07:00
parent ee46fc0e8a
commit 53d3fd718a

View File

@ -321,8 +321,9 @@ static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
} }
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
uint8_t *dst, const struct Filter filter) { __m128i *f, int tapsNum, uint8_t *dst) {
const __m128i k_256 = _mm_set1_epi16(1 << 8); const __m128i k_256 = _mm_set1_epi16(1 << 8);
#if 0
// pack and duplicate the filter values // pack and duplicate the filter values
const __m128i f1f0 = *((__m128i *)(filter.coeffs + 0)); const __m128i f1f0 = *((__m128i *)(filter.coeffs + 0));
const __m128i f3f2 = *((__m128i *)(filter.coeffs + 1)); const __m128i f3f2 = *((__m128i *)(filter.coeffs + 1));
@ -330,7 +331,8 @@ static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
const __m128i f7f6 = *((__m128i *)(filter.coeffs + 3)); const __m128i f7f6 = *((__m128i *)(filter.coeffs + 3));
const __m128i f9f8 = *((__m128i *)(filter.coeffs + 4)); const __m128i f9f8 = *((__m128i *)(filter.coeffs + 4));
const __m128i fbfa = *((__m128i *)(filter.coeffs + 5)); const __m128i fbfa = *((__m128i *)(filter.coeffs + 5));
if (filter.tapsNum == 10) { #endif
if (tapsNum == 10) {
src_ptr -= 1; src_ptr -= 1;
} }
const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
@ -373,12 +375,12 @@ static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
const __m128i sbsa = _mm_srli_si128(s9s8, 8); const __m128i sbsa = _mm_srli_si128(s9s8, 8);
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); const __m128i x0 = _mm_maddubs_epi16(s1s0, f[0]);
const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); const __m128i x1 = _mm_maddubs_epi16(s3s2, f[1]);
const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); const __m128i x2 = _mm_maddubs_epi16(s5s4, f[2]);
const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); const __m128i x3 = _mm_maddubs_epi16(s7s6, f[3]);
const __m128i x4 = _mm_maddubs_epi16(s9s8, f9f8); const __m128i x4 = _mm_maddubs_epi16(s9s8, f[4]);
const __m128i x5 = _mm_maddubs_epi16(sbsa, fbfa); const __m128i x5 = _mm_maddubs_epi16(sbsa, f[5]);
// add and saturate the results together // add and saturate the results together
const __m128i min_x2x3 = _mm_min_epi16(x2, x3); const __m128i min_x2x3 = _mm_min_epi16(x2, x3);
const __m128i max_x2x3 = _mm_max_epi16(x2, x3); const __m128i max_x2x3 = _mm_max_epi16(x2, x3);
@ -436,17 +438,31 @@ void run_target_filter(uint8_t *src, int width, int height, int stride,
void run_subpixel_filter(uint8_t *src, int width, int height, int stride, void run_subpixel_filter(uint8_t *src, int width, int height, int stride,
const struct Filter filter, uint8_t *dst) { const struct Filter filter, uint8_t *dst) {
uint8_t temp[4 * 4] __attribute__ ((aligned(16))); uint8_t temp[4 * 4] __attribute__ ((aligned(16)));
uint8_t *src_ptr = src; __m128i f[6];
int tapsNum;
uint8_t *src_ptr;
uint32_t start, end; uint32_t start, end;
int count = 0; int count;
int block_height = height >> 2; int block_height;
int col, i; int col, i;
start = readtsc(); start = readtsc();
tapsNum = filter.tapsNum;
count = 0;
block_height = height >> 2;
src_ptr = src;
f[0] = *((__m128i *)(filter.coeffs));
f[1] = *((__m128i *)(filter.coeffs + 1));
f[2] = *((__m128i *)(filter.coeffs + 2));
f[3] = *((__m128i *)(filter.coeffs + 3));
f[4] = *((__m128i *)(filter.coeffs + 4));
f[5] = *((__m128i *)(filter.coeffs + 5));
do { do {
for (col = 0; col < width; col += 4) { for (col = 0; col < width; col += 4) {
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
filter_horiz_w4_ssse3(src_ptr, stride, temp + (i * 4), filter); filter_horiz_w4_ssse3(src_ptr, stride, f, tapsNum, temp + (i * 4));
src_ptr += 1; src_ptr += 1;
} }
transpose4x4_to_dst(temp, 4, dst + col, stride); transpose4x4_to_dst(temp, 4, dst + col, stride);