vpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
Johann eb88b172fe Make vp9 subpixel match vp8
The only difference between the two was that the vp9 function allowed
every step of the bilinear filter (16 steps) while vp8 only allowed
half of those. Since all the call sites in vp9 shift the input left by
one (<< 1), it only ever used the same steps as vp8.
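For example, an eighth-pel offset of 3 was passed in as 3 << 1 = 6,
selecting the sixteenth-step filter {10, 6}, which is the same filter
vp8 uses for offset 3.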

This will allow moving the subpel variance to vpx_dsp with the rest of
the variance functions.

Change-Id: I6fa2509350a2dc610c46b3e15bde98a15a084b75
2015-06-03 22:10:51 -07:00

/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <immintrin.h> // AVX2
#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
#define FILTER_SRC(filter) \
/* filter the source */ \
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
\
/* add 8 to source */ \
exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
\
/* divide source by 16 */ \
exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
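// FILTER_SRC computes (a * f0 + b * f1 + 8) >> 4 for each interleaved pixel
// pair {a, b}, i.e. a bilinear blend with round-to-nearest carried out in
// 16-bit intermediate precision.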
#define MERGE_WITH_SRC(src_reg, reg) \
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
#define LOAD_SRC_DST \
/* load source and destination */ \
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
#define AVG_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
/* average the current source with the source size_stride bytes away */ \
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
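// _mm256_avg_epu8 computes (a + b + 1) >> 1, which is exactly the bilinear
// filter for the half-pel offset, so the offset == 8 cases below can use a
// cheap byte average instead of the full FILTER_SRC path.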
#define MERGE_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
MERGE_WITH_SRC(src_reg, src_next_reg)
#define CALC_SUM_SSE_INSIDE_LOOP \
/* expand each byte to 2 bytes */ \
exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
/* source - dest */ \
exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
/* calculate sum */ \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
/* calculate sse */ \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
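// The differences are accumulated in 16-bit lanes: each of the 16 lanes of
// sum_reg receives two differences per 32-wide row, so the worst case of
// 2 * height * 255 stays within int16 range for heights up to 64
// (2 * 64 * 255 = 32640). The squared terms go straight into 32-bit lanes
// via _mm256_madd_epi16.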
// final reduction of sum and sse to scalars
#define CALC_SUM_AND_SSE \
res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
\
sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
\
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
*((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
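// CALC_SUM_AND_SSE sign-extends the 16-bit partial sums to 32 bits (using
// the compare-with-zero mask as the high halves), then reduces sum_reg and
// sse_reg horizontally and across the two 128-bit halves into scalars.

// Computes, for a 32 x height block, the sum of differences (returned) and
// the sum of squared differences (written to *sse) between the sub-pel
// interpolated source and dst. The sub-pixel variance wrappers in
// vp9_variance.c are then expected to derive the variance as
// *sse - (sum * sum) / (32 * height).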
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
int height,
unsigned int *sse) {
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
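// The nine branches below cover every combination of x_offset and y_offset
// being 0 (no filtering), 8 (half-pel, handled with a byte average) or any
// other value (general bilinear filtering via the table above).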
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source and a second copy starting one byte later
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
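// the averaged row computed above is carried into the loop as the
// "previous" row, so each iteration only has to horizontally average one
// new row before blending it vertically with the previous one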
for (i = 0; i < height ; i++) {
// save the current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// vertically average the previous row's result with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second copy starting one byte later
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit filter output back to 8 bits in each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// vertically average the previous packed row with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src_pack = src_reg;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second copy starting one byte later
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit filter output back to 8 bits in each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// interleave the previous packed row with the current one for the y filter
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}
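// Identical to vp9_sub_pixel_variance32xh_avx2 above, except that the
// filtered prediction is additionally averaged with the second predictor
// sec (rounding up, via _mm256_avg_epu8) before the sum and SSE are
// accumulated; this is the averaging-prediction form used by the
// vp9_sub_pixel_avg_variance wrappers.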
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sse) {
__m256i sec_reg;
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
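// fold in the second predictor: prediction = (src + sec + 1) >> 1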
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source and a second copy starting one byte later
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// vertically average the previous row's result with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second copy starting one byte later
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit filter output back to 8 bits in each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// vertically average the previous packed row with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second copy starting one byte later
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit filter output back to 8 bits in each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// interleave the previous packed row with the current one for the y filter
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}