
Properly handle the case where the height is an integer multiple of 4. Change-Id: I11ac188c13f78db20902e2e333c60ce76ce837c5
478 lines
15 KiB
C
478 lines
15 KiB
C
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
|
|
|
|
#include <assert.h>
|
|
#include <smmintrin.h>
|
|
|
|
#include "./vp10_rtcd.h"
|
|
#include "vp10/common/filter.h"
|
|
|
|
// Signature of the routines that finish a 4x4 block of 32-bit filter sums
// (src, rows src_stride apart) and store it as 16-bit pixels into dst.
//   width     : 2 selects 2-pixel row stores, anything else 4-pixel stores.
//   pixelsNum : 0 stores all four rows, 1/2/3 stores only that many rows.
//   bd        : bit depth used for clipping.
typedef void (*TransposeSave)(const int width, int pixelsNum, uint32_t *src,
                              int src_stride, uint16_t *dst, int dst_stride,
                              int bd);
|
|
|
|
// Stores up to four result rows held in u[0..3] to dst (rows are dst_stride
// pixels apart).
//   width == 2 : each row is the low 32 bits of u[r] (2 pixels);
//   otherwise  : each row is the low 64 bits of u[r] (4 pixels).
//   pixelsNum == 0     : all 4 rows are written;
//   pixelsNum == 1/2/3 : only the first 1/2/3 (residual) rows are written.
// Any other pixelsNum value writes nothing.
static void writePixel(__m128i *u, int width, int pixelsNum,
                       uint16_t *dst, int dst_stride) {
  int rows, r;

  if (pixelsNum < 0 || pixelsNum > 3) return;
  rows = (0 == pixelsNum) ? 4 : pixelsNum;

  for (r = 0; r < rows; ++r) {
    uint16_t *const row = dst + r * dst_stride;
    if (2 == width) {
      *(int *)row = _mm_cvtsi128_si32(u[r]);
    } else {
      _mm_storel_epi64((__m128i *)row, u[r]);
    }
  }
}
|
|
|
|
// Clamps every unsigned 16-bit lane of p[0..numVecs-1] to the range
// [0, (1 << bd) - 1] (bd is the pixel bit depth, e.g. 10 or 12).
//
// The lanes are treated as unsigned, which matches the _mm_packus_epi32
// output that the callers in this file pass in. A signed compare would
// misread lanes >= 0x8000 as negative and force them to 0 instead of the
// maximum, so an unsigned minimum is used instead. It is built from SSE2
// saturating subtraction: min(p, max) == p - sat_sub_u16(p, max).
static void highbd_clip(__m128i *p, int numVecs, int bd) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  int i;

  for (i = 0; i < numVecs; i++) {
    // excess is 0 where p[i] <= max, and p[i] - max where it is larger.
    const __m128i excess = _mm_subs_epu16(p[i], max);
    p[i] = _mm_sub_epi16(p[i], excess);
  }
}
|
|
|
|
static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
|
|
__m128i v0, v1;
|
|
__m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
|
|
|
|
u[0] = _mm_loadu_si128((__m128i const *)src);
|
|
u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
|
|
u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
|
|
u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
|
|
|
|
u[0] = _mm_add_epi32(u[0], rnd);
|
|
u[1] = _mm_add_epi32(u[1], rnd);
|
|
u[2] = _mm_add_epi32(u[2], rnd);
|
|
u[3] = _mm_add_epi32(u[3], rnd);
|
|
|
|
u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
|
|
u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
|
|
u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
|
|
u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
|
|
|
|
u[0] = _mm_packus_epi32(u[0], u[1]);
|
|
u[1] = _mm_packus_epi32(u[2], u[3]);
|
|
|
|
highbd_clip(u, 2, bd);
|
|
|
|
v0 = _mm_unpacklo_epi16(u[0], u[1]);
|
|
v1 = _mm_unpackhi_epi16(u[0], u[1]);
|
|
|
|
u[0] = _mm_unpacklo_epi16(v0, v1);
|
|
u[2] = _mm_unpackhi_epi16(v0, v1);
|
|
|
|
u[1] = _mm_srli_si128(u[0], 8);
|
|
u[3] = _mm_srli_si128(u[2], 8);
|
|
}
|
|
|
|
// Rounds, clips and transposes a 4x4 block of 32-bit filter sums from src,
// then stores the resulting pixels to dst (overwriting dst).
//   pixelsNum = 0     : all 4 rows of pixels will be saved.
//   pixelsNum = 1/2/3 : only the residual 1/2/3 rows of pixels will be saved.
//   width = 2         : 2 pixels are stored per row, otherwise 4.
void trans_save_4x4(const int width, int pixelsNum,
                    uint32_t *src, int src_stride,
                    uint16_t *dst, int dst_stride,
                    int bd) {
  __m128i rows[4];

  transClipPixel(src, src_stride, rows, bd);
  writePixel(rows, width, pixelsNum, dst, dst_stride);
}
|
|
|
|
// Same as trans_save_4x4, but averages the new pixels with the pixels already
// in dst: dst = (dst + new + 1) >> 1.
//   pixelsNum = 0     : all 4 rows are averaged and saved.
//   pixelsNum = 1/2/3 : only the residual 1/2/3 rows are averaged and saved.
//   width = 2         : 2 pixels per row, otherwise 4.
//
// Only the rows and pixels that will be written back are read from dst, so
// the routine never touches memory beyond the block it updates.
void trans_accum_save_4x4(const int width, int pixelsNum,
                          uint32_t *src, int src_stride,
                          uint16_t *dst, int dst_stride,
                          int bd) {
  const __m128i ones = _mm_set1_epi16(1);
  // Number of rows writePixel() will store for this pixelsNum. Out-of-range
  // values store nothing, so keeping the full 4-row read for them is harmless.
  const int rows = (pixelsNum >= 1 && pixelsNum <= 3) ? pixelsNum : 4;
  __m128i u[4];
  int r;

  transClipPixel(src, src_stride, u, bd);

  for (r = 0; r < rows; ++r) {
    const uint16_t *const row = dst + r * dst_stride;
    // Load exactly as many pixels as will be stored (2 or 4).
    const __m128i prev = (2 == width)
                             ? _mm_cvtsi32_si128(*(const int *)row)
                             : _mm_loadl_epi64((__m128i const *)row);
    u[r] = _mm_add_epi16(u[r], prev);
    u[r] = _mm_add_epi16(u[r], ones);
    u[r] = _mm_srai_epi16(u[r], 1);
  }

  writePixel(u, width, pixelsNum, dst, dst_stride);
}
|
|
|
|
static TransposeSave transSaveTab[2] = {
|
|
trans_save_4x4, trans_accum_save_4x4};
|
|
|
|
// Regroups 4 rows of 16-bit pixels into per-tap-pair vectors.
//   in[0..3] : rows 0..3, pixels 0..7
//   in[4..7] : rows 0..3, pixels 8..15
//   out[k]   : for k = 0..5, the pixel pair (2k, 2k+1) of rows 0,1,2,3 in
//              32-bit lanes 0,1,2,3.
static INLINE void transpose_pair(__m128i *in, __m128i *out) {
  __m128i rows01, rows23;

  // Pixels 0..3 of every row.
  rows01 = _mm_unpacklo_epi32(in[0], in[1]);
  rows23 = _mm_unpacklo_epi32(in[2], in[3]);
  out[0] = _mm_unpacklo_epi64(rows01, rows23);
  out[1] = _mm_unpackhi_epi64(rows01, rows23);

  // Pixels 4..7 of every row.
  rows01 = _mm_unpackhi_epi32(in[0], in[1]);
  rows23 = _mm_unpackhi_epi32(in[2], in[3]);
  out[2] = _mm_unpacklo_epi64(rows01, rows23);
  out[3] = _mm_unpackhi_epi64(rows01, rows23);

  // Pixels 8..11 of every row (pixels 12..15 are not needed).
  rows01 = _mm_unpacklo_epi32(in[4], in[5]);
  rows23 = _mm_unpacklo_epi32(in[6], in[7]);
  out[4] = _mm_unpacklo_epi64(rows01, rows23);
  out[5] = _mm_unpackhi_epi64(rows01, rows23);
}
|
|
|
|
// Applies the horizontal filter at one pixel position to four consecutive
// rows and writes the four unrounded 32-bit sums (rows 0..3) to buf.
//   src / src_stride : first tap position of row 0; 16 pixels are read from
//                      each of the 4 rows.
//   f                : six vectors of coefficient pairs, one per tap pair.
//   tapsNum          : when 10, src is moved back by one pixel
//                      (presumably the 10-tap coefficients are stored padded
//                      to 12 entries with a leading zero -- confirm against
//                      the coefficient tables).
//   buf              : receives 4 x uint32_t (unaligned store).
//
// The six partial products are summed with plain wrapping 32-bit adds, so
// the order of accumulation cannot change the result.
static void highbd_filter_horiz(const uint16_t *src, int src_stride,
                                const __m128i *f, int tapsNum, uint32_t *buf) {
  __m128i rows[8], pairs[6];
  __m128i sum;
  int k;

  if (tapsNum == 10) {
    src -= 1;
  }

  for (k = 0; k < 4; ++k) {
    const uint16_t *const row = src + k * src_stride;
    rows[k] = _mm_loadu_si128((__m128i const *)row);
    rows[k + 4] = _mm_loadu_si128((__m128i const *)(row + 8));
  }

  transpose_pair(rows, pairs);

  sum = _mm_madd_epi16(pairs[0], f[0]);
  for (k = 1; k < 6; ++k) {
    sum = _mm_add_epi32(sum, _mm_madd_epi16(pairs[k], f[k]));
  }

  _mm_storeu_si128((__m128i *)buf, sum);
}
|
|
|
|
void vp10_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
|
|
uint16_t *dst, int dst_stride,
|
|
int w, int h,
|
|
const InterpFilterParams filter_params,
|
|
const int subpel_x_q4, int x_step_q4,
|
|
int avg, int bd) {
|
|
DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
|
|
__m128i verf[6];
|
|
HbdSubpelFilterCoeffs vCoeffs;
|
|
const uint16_t *srcPtr;
|
|
const int tapsNum = filter_params.taps;
|
|
int i, col, count, blkResidu, blkHeight;
|
|
TransposeSave transSave = transSaveTab[avg];
|
|
(void)x_step_q4;
|
|
|
|
if (0 == subpel_x_q4 || 16 != x_step_q4) {
|
|
vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
|
|
filter_params, subpel_x_q4, x_step_q4, avg,
|
|
bd);
|
|
return;
|
|
}
|
|
|
|
vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
|
|
filter_params, subpel_x_q4 - 1);
|
|
if (!vCoeffs) {
|
|
vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
|
|
filter_params, subpel_x_q4, x_step_q4, avg,
|
|
bd);
|
|
return;
|
|
}
|
|
|
|
verf[0] = *((const __m128i *)(vCoeffs));
|
|
verf[1] = *((const __m128i *)(vCoeffs + 1));
|
|
verf[2] = *((const __m128i *)(vCoeffs + 2));
|
|
verf[3] = *((const __m128i *)(vCoeffs + 3));
|
|
verf[4] = *((const __m128i *)(vCoeffs + 4));
|
|
verf[5] = *((const __m128i *)(vCoeffs + 5));
|
|
|
|
src -= (tapsNum >> 1) - 1;
|
|
srcPtr = src;
|
|
|
|
count = 0;
|
|
blkHeight = h >> 2;
|
|
blkResidu = h & 3;
|
|
|
|
while (blkHeight != 0) {
|
|
for (col = 0; col < w; col += 4) {
|
|
for (i = 0; i < 4; ++i) {
|
|
highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
|
|
srcPtr += 1;
|
|
}
|
|
transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
|
|
}
|
|
count++;
|
|
srcPtr = src + count * src_stride * 4;
|
|
dst += dst_stride * 4;
|
|
blkHeight--;
|
|
}
|
|
|
|
if (blkResidu == 0)
|
|
return;
|
|
|
|
for (col = 0; col < w; col += 4) {
|
|
for (i = 0; i < 4; ++i) {
|
|
highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
|
|
srcPtr += 1;
|
|
}
|
|
transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
|
|
}
|
|
}
|
|
|
|
// Vertical convolutional filter
|
|
|
|
// Signature of the store routines used by the vertical filter: u[0] holds the
// 32-bit filter sums, which are rounded, clipped to bd bits and then written
// to (or averaged into) dst.
typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
|
|
|
|
static void highbdRndingPacks(__m128i *u) {
|
|
__m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
|
|
u[0] = _mm_add_epi32(u[0], rnd);
|
|
u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
|
|
u[0] = _mm_packus_epi32(u[0], u[0]);
|
|
}
|
|
|
|
// Rounds, packs and clips the sums in u[0] to bd bits, then overwrites the
// two pixels at dst with the two lowest results (a single 32-bit store).
static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  uint32_t *const out = (uint32_t *)dst;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);
  *out = _mm_cvtsi128_si32(u[0]);
}
|
|
|
|
// Rounds, packs and clips the sums in u[0] to bd bits, then averages the two
// lowest results with the two pixels already at dst:
//   dst = (dst + new + 1) >> 1.
// Exactly two pixels (32 bits) are read from and written to dst, so the
// routine never reads past the 2-pixel-wide destination row.
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  const __m128i ones = _mm_set1_epi16(1);
  __m128i v = _mm_cvtsi32_si128(*(const int *)dst);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
}
|
|
|
|
// Two-pixel store routines indexed by the `avg` flag:
// [0] overwrites dst, [1] averages the result with dst.
WritePixels write2pixelsTab[2] = {write2pixelsOnly, write2pixelsAccum};
|
|
|
|
// Rounds, packs and clips the sums in u[0] to bd bits, then overwrites the
// four pixels at dst with the results (a single 64-bit store).
static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  __m128i *const out = (__m128i *)dst;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);
  _mm_storel_epi64(out, u[0]);
}
|
|
|
|
// Rounds, packs and clips the sums in u[0] to bd bits, then averages the four
// results with the four pixels already at dst:
//   dst = (dst + new + 1) >> 1.
static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  const __m128i bias = _mm_set1_epi16(1);
  __m128i acc = _mm_loadl_epi64((__m128i const *)dst);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  acc = _mm_add_epi16(_mm_add_epi16(acc, u[0]), bias);
  acc = _mm_srai_epi16(acc, 1);
  _mm_storel_epi64((__m128i *)dst, acc);
}
|
|
|
|
// Four-pixel store routines indexed by the `avg` flag:
// [0] overwrites dst, [1] averages the result with dst.
WritePixels write4pixelsTab[2] = {write4pixelsOnly, write4pixelsAccum};
|
|
|
|
// Applies the vertical filter to one output row, four adjacent columns at a
// time, and hands the four 32-bit sums (in s[0]) to saveFunc.
//   src / src_stride : first tap row; 8 pixels are loaded from each row used.
//   f                : six vectors of coefficient pairs, one per row pair.
//   taps             : 12 loads twelve rows; 10 loads eleven rows into
//                      s[1..11] and uses a zero row for s[0].
//   dst, saveFunc, bd: forwarded to the store routine.
//
// The six partial products are summed with plain wrapping 32-bit adds, so
// the order of accumulation cannot change the result.
static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
                                       const __m128i *f, int taps,
                                       uint16_t *dst, WritePixels saveFunc,
                                       int bd) {
  __m128i s[12];
  int i = 0;
  int r = 0;
  int k;

  // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
  if (10 == taps) {
    s[0] = _mm_setzero_si128();
    i = 1;
  }
  for (; i < 12; ++i, ++r) {
    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
  }

  // Interleave each pair of rows (low 4 pixels) and multiply-accumulate with
  // the matching coefficient pair.
  for (k = 0; k < 12; k += 2) {
    s[k] = _mm_unpacklo_epi16(s[k], s[k + 1]);
    s[k] = _mm_madd_epi16(s[k], f[k >> 1]);
  }

  s[0] = _mm_add_epi32(s[0], s[2]);
  s[0] = _mm_add_epi32(s[0], s[10]);
  s[0] = _mm_add_epi32(s[0], s[8]);
  s[0] = _mm_add_epi32(s[0], s[4]);
  s[0] = _mm_add_epi32(s[0], s[6]);

  saveFunc(s, bd, dst);
}
|
|
|
|
static void highbd_filter_vert_compute_large(const uint16_t *src,
|
|
int src_stride,
|
|
const __m128i *f, int taps,
|
|
int w, int h,
|
|
uint16_t *dst, int dst_stride,
|
|
int avg, int bd) {
|
|
int col;
|
|
int rowIndex = 0;
|
|
const uint16_t *src_ptr = src;
|
|
uint16_t *dst_ptr = dst;
|
|
const int step = 4;
|
|
WritePixels write4pixels = write4pixelsTab[avg];
|
|
|
|
do {
|
|
for (col = 0; col < w; col += step) {
|
|
filter_vert_horiz_parallel(src_ptr, src_stride, f, taps,
|
|
dst_ptr, write4pixels, bd);
|
|
src_ptr += step;
|
|
dst_ptr += step;
|
|
}
|
|
rowIndex++;
|
|
src_ptr = src + rowIndex * src_stride;
|
|
dst_ptr = dst + rowIndex * dst_stride;
|
|
} while (rowIndex < h);
|
|
}
|
|
|
|
static void highbd_filter_vert_compute_small(const uint16_t *src,
|
|
int src_stride,
|
|
const __m128i *f, int taps,
|
|
int w, int h,
|
|
uint16_t *dst, int dst_stride,
|
|
int avg, int bd) {
|
|
int rowIndex = 0;
|
|
WritePixels write2pixels = write2pixelsTab[avg];
|
|
(void)w;
|
|
|
|
do {
|
|
filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels,
|
|
bd);
|
|
rowIndex++;
|
|
src += src_stride;
|
|
dst += dst_stride;
|
|
} while (rowIndex < h);
|
|
}
|
|
|
|
void vp10_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
|
|
uint16_t *dst, int dst_stride,
|
|
int w, int h,
|
|
const InterpFilterParams filter_params,
|
|
const int subpel_y_q4, int y_step_q4,
|
|
int avg, int bd) {
|
|
__m128i verf[6];
|
|
HbdSubpelFilterCoeffs vCoeffs;
|
|
const int tapsNum = filter_params.taps;
|
|
|
|
if (0 == subpel_y_q4 || 16 != y_step_q4) {
|
|
vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
|
|
filter_params, subpel_y_q4, y_step_q4, avg,
|
|
bd);
|
|
return;
|
|
}
|
|
|
|
vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
|
|
filter_params, subpel_y_q4 - 1);
|
|
if (!vCoeffs) {
|
|
vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
|
|
filter_params, subpel_y_q4, y_step_q4, avg,
|
|
bd);
|
|
return;
|
|
}
|
|
|
|
verf[0] = *((const __m128i *)(vCoeffs));
|
|
verf[1] = *((const __m128i *)(vCoeffs + 1));
|
|
verf[2] = *((const __m128i *)(vCoeffs + 2));
|
|
verf[3] = *((const __m128i *)(vCoeffs + 3));
|
|
verf[4] = *((const __m128i *)(vCoeffs + 4));
|
|
verf[5] = *((const __m128i *)(vCoeffs + 5));
|
|
|
|
src -= src_stride * ((tapsNum >> 1) - 1);
|
|
|
|
if (w > 2) {
|
|
highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h,
|
|
dst, dst_stride, avg, bd);
|
|
} else {
|
|
highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h,
|
|
dst, dst_stride, avg, bd);
|
|
}
|
|
}
|