Merge "Optimize bilateral filter to improve speed" into nextgen

This commit is contained in:
Shunyao Li 2015-07-07 20:17:12 +00:00 committed by Gerrit Code Review
commit e8885d4a50
2 changed files with 109 additions and 62 deletions

View File

@ -10,6 +10,7 @@
#include <math.h> #include <math.h>
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_onyxc_int.h"
@ -233,6 +234,67 @@ static const int mode_lf_lut[MB_MODE_COUNT] = {
static const int bilateral_weight = (1 << BILATERAL_WEIGHT_BITS) - 1; static const int bilateral_weight = (1 << BILATERAL_WEIGHT_BITS) - 1;
static const int bilateral_weight_round = 1 << (BILATERAL_WEIGHT_BITS - 1); static const int bilateral_weight_round = 1 << (BILATERAL_WEIGHT_BITS - 1);
static double bilateral_filters_r_kf[BILATERAL_LEVELS_KF + 1][513];
static double bilateral_filters_r[BILATERAL_LEVELS + 1][513];
static double bilateral_filters_s_kf[BILATERAL_LEVELS_KF + 1]
[BILATERAL_WIN][BILATERAL_WIN];
static double bilateral_filters_s[BILATERAL_LEVELS + 1]
[BILATERAL_WIN][BILATERAL_WIN];
void vp9_loop_bilateral_precal() {
int i;
for (i = 1; i < BILATERAL_LEVELS_KF + 1; i ++) {
const bilateral_params_t param = vp9_bilateral_level_to_params(i, 1);
const int sigma_x = param.sigma_x;
const int sigma_y = param.sigma_y;
const int sigma_r = param.sigma_r;
const double sigma_r_d = (double)sigma_r / BILATERAL_PRECISION;
const double sigma_x_d = (double)sigma_x / BILATERAL_PRECISION;
const double sigma_y_d = (double)sigma_y / BILATERAL_PRECISION;
double *fr = bilateral_filters_r_kf[i] + 256;
int j, x, y;
for (j = 0; j <= 256; j++) {
fr[j] = exp(-(j * j) / (2 * sigma_r_d * sigma_r_d));
fr[-j] = fr[j];
}
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; y++) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; x++) {
bilateral_filters_s_kf[i][y + BILATERAL_HALFWIN]
[x + BILATERAL_HALFWIN] =
exp(-(x * x) / (2 * sigma_x_d * sigma_x_d)
-(y * y) / (2 * sigma_y_d * sigma_y_d));
}
}
}
for (i = 1; i < BILATERAL_LEVELS + 1; i ++) {
const bilateral_params_t param = vp9_bilateral_level_to_params(i, 0);
const int sigma_x = param.sigma_x;
const int sigma_y = param.sigma_y;
const int sigma_r = param.sigma_r;
const double sigma_r_d = (double)sigma_r / BILATERAL_PRECISION;
const double sigma_x_d = (double)sigma_x / BILATERAL_PRECISION;
const double sigma_y_d = (double)sigma_y / BILATERAL_PRECISION;
double *fr = bilateral_filters_r[i] + 256;
int j, x, y;
for (j = 0; j <= 256; j++) {
fr[j] = exp(-(j * j) / (2 * sigma_r_d * sigma_r_d));
fr[-j] = fr[j];
}
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; y++) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; x++) {
bilateral_filters_s[i][y + BILATERAL_HALFWIN][x + BILATERAL_HALFWIN] =
exp(-(x * x) / (2 * sigma_x_d * sigma_x_d)
-(y * y) / (2 * sigma_y_d * sigma_y_d));
}
}
}
}
int vp9_bilateral_level_bits(const VP9_COMMON *const cm) { int vp9_bilateral_level_bits(const VP9_COMMON *const cm) {
return cm->frame_type == KEY_FRAME ? return cm->frame_type == KEY_FRAME ?
BILATERAL_LEVEL_BITS_KF : BILATERAL_LEVEL_BITS; BILATERAL_LEVEL_BITS_KF : BILATERAL_LEVEL_BITS;
@ -244,35 +306,15 @@ int vp9_loop_bilateral_used(int level, int kf) {
} }
void vp9_loop_bilateral_init(loop_filter_info_n *lfi, int level, int kf) { void vp9_loop_bilateral_init(loop_filter_info_n *lfi, int level, int kf) {
const bilateral_params_t param = vp9_bilateral_level_to_params(level, kf);
lfi->bilateral_used = vp9_loop_bilateral_used(level, kf); lfi->bilateral_used = vp9_loop_bilateral_used(level, kf);
if (lfi->bilateral_used) { if (lfi->bilateral_used) {
if (param.sigma_x != lfi->bilateral_sigma_x_set || int i;
param.sigma_y != lfi->bilateral_sigma_y_set || lfi->wr_lut = kf ? bilateral_filters_r_kf[level] :
param.sigma_r != lfi->bilateral_sigma_r_set) { bilateral_filters_r[level];
const int sigma_x = param.sigma_x; for (i = 0; i < BILATERAL_WIN; i++)
const int sigma_y = param.sigma_y; lfi->wx_lut[i] = kf ? bilateral_filters_s_kf[level][i] :
const int sigma_r = param.sigma_r; bilateral_filters_s[level][i];
const double sigma_r_d = (double)sigma_r / BILATERAL_PRECISION;
const double sigma_x_d = (double)sigma_x / BILATERAL_PRECISION;
const double sigma_y_d = (double)sigma_y / BILATERAL_PRECISION;
double *wr_lut_ = lfi->wr_lut + 255;
double *wx_lut_ = lfi->wx_lut + BILATERAL_HALFWIN * (1 + BILATERAL_WIN);
int i, x, y;
for (i = 0; i < 256; i++) {
wr_lut_[i] = exp(-(i * i) / (2 * sigma_r_d * sigma_r_d));
wr_lut_[-i] = wr_lut_[i];
}
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; y++)
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; x++) {
wx_lut_[y * BILATERAL_WIN + x] =
exp(-(x * x) / (2 * sigma_x_d * sigma_x_d) -
(y * y) / (2 * sigma_y_d * sigma_y_d));
}
lfi->bilateral_sigma_x_set = sigma_x;
lfi->bilateral_sigma_y_set = sigma_y;
lfi->bilateral_sigma_r_set = sigma_r;
}
} }
} }
@ -284,33 +326,33 @@ void loop_bilateral_filter(uint8_t *data, int width, int height,
int stride, loop_filter_info_n *lfi, int stride, loop_filter_info_n *lfi,
uint8_t *tmpdata, int tmpstride) { uint8_t *tmpdata, int tmpstride) {
int i, j; int i, j;
const double *wr_lut_ = lfi->wr_lut + 255; const double *wr_lut_ = lfi->wr_lut + 256;
const double *wx_lut_ = lfi->wx_lut + BILATERAL_HALFWIN * (1 + BILATERAL_WIN);
uint8_t *data_p = data;
uint8_t *tmpdata_p = tmpdata;
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) { for (j = 0; j < width; ++j) {
int x, y; int x, y;
double wt; double flsum = 0, wtsum = 0, wt;
double flsum = 0; uint8_t *data_p2 = data_p + j - BILATERAL_HALFWIN * stride;
double wtsum = 0;
uint8_t *v = data + i * stride + j;
uint8_t *z = tmpdata + i * tmpstride + j;
uint8_t *d;
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; ++y) { for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; ++y) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; ++x) { for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; ++x) {
if (!is_in_image(j + x, i + y, width, height)) if (!is_in_image(j + x, i + y, width, height))
continue; continue;
d = data + (i + y) * stride + (j + x); wt = lfi->wx_lut[y + BILATERAL_HALFWIN][x + BILATERAL_HALFWIN] *
wt = wr_lut_[d[0] - v[0]] * wx_lut_[y * BILATERAL_WIN + x]; wr_lut_[data_p2[x] - data_p[j]];
wtsum += wt; wtsum += wt;
flsum += wt * d[0]; flsum += wt * data_p2[x];
} }
data_p2 += stride;
} }
if (wtsum > 0) assert(wtsum > 0);
z[0] = (int)(flsum / wtsum + 0.5); tmpdata_p[j] = clip_pixel((int)(flsum / wtsum + 0.5));
else
z[0] = v[0];
} }
tmpdata_p += tmpstride;
data_p += stride;
} }
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
vpx_memcpy(data + i * stride, tmpdata + i * tmpstride, vpx_memcpy(data + i * stride, tmpdata + i * tmpstride,
width * sizeof(*data)); width * sizeof(*data));
@ -322,36 +364,41 @@ void loop_bilateral_filter_highbd(uint8_t *data8, int width, int height,
int stride, loop_filter_info_n *lfi, int stride, loop_filter_info_n *lfi,
uint8_t *tmpdata8, int tmpstride, int bit_depth) { uint8_t *tmpdata8, int tmpstride, int bit_depth) {
int i, j; int i, j;
const double *wr_lut_ = lfi->wr_lut + 255; const double *wr_lut_ = lfi->wr_lut + 256;
const double *wx_lut_ = lfi->wx_lut + BILATERAL_HALFWIN * (1 + BILATERAL_WIN);
uint16_t *data = CONVERT_TO_SHORTPTR(data8); uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8); uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
uint16_t *data_p = data;
uint16_t *tmpdata_p = tmpdata;
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) { for (j = 0; j < width; ++j) {
int x, y; int x, y, diff_r;
double wt; double flsum = 0, wtsum = 0, wt;
double flsum = 0; uint16_t *data_p2 = data_p + j - BILATERAL_HALFWIN * stride;
double wtsum = 0;
uint16_t *v = data + i * stride + j;
uint16_t *z = tmpdata + i * tmpstride + j;
uint16_t *d;
for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; ++y) { for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; ++y) {
for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; ++x) { for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; ++x) {
if (!is_in_image(j + x, i + y, width, height)) if (!is_in_image(j + x, i + y, width, height))
continue; continue;
d = data + (i + y) * stride + (j + x);
wt = wr_lut_[(d[0] - v[0]) >> (bit_depth - 8)] * diff_r = (data_p2[x] - data_p[j]) >> (bit_depth - 8);
wx_lut_[y * BILATERAL_WIN + x]; assert(diff_r >= -256 && diff_r <= 256);
wt = lfi->wx_lut[y + BILATERAL_HALFWIN][x + BILATERAL_HALFWIN] *
wr_lut_[diff_r];
wtsum += wt; wtsum += wt;
flsum += wt * d[0]; flsum += wt * data_p2[x];
} }
data_p2 += stride;
} }
if (wtsum > 0)
z[0] = (int)(flsum / wtsum + 0.5); assert(wtsum > 0);
else tmpdata_p[j] = (int)(flsum / wtsum + 0.5);
z[0] = v[0];
} }
tmpdata_p += tmpstride;
data_p += stride;
} }
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
vpx_memcpy(data + i * stride, tmpdata + i * tmpstride, vpx_memcpy(data + i * stride, tmpdata + i * tmpstride,
width * sizeof(*data)); width * sizeof(*data));
@ -468,7 +515,7 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
#if CONFIG_LOOP_POSTFILTER #if CONFIG_LOOP_POSTFILTER
vp9_loop_bilateral_init(lfi, DEF_BILATERAL_LEVEL, 1); vp9_loop_bilateral_precal();
#endif // CONFIG_LOOP_POSTFILTER #endif // CONFIG_LOOP_POSTFILTER
} }

View File

@ -129,8 +129,8 @@ typedef struct {
loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
#if CONFIG_LOOP_POSTFILTER #if CONFIG_LOOP_POSTFILTER
double wx_lut[BILATERAL_WIN * BILATERAL_WIN]; double * wx_lut[BILATERAL_WIN];
double wr_lut[512]; double * wr_lut;
int bilateral_sigma_x_set; int bilateral_sigma_x_set;
int bilateral_sigma_y_set; int bilateral_sigma_y_set;
int bilateral_sigma_r_set; int bilateral_sigma_r_set;