Reduce Walsh-Hadamard transform (WHT) complexity.

This saves 1 add and 3 shifts (plus the `+ 1` shift bias) per 1-D transform.

Change-Id: I1104bb1679fe342b2f9677df8a9cdc0cb9699e7d
This commit is contained in:
Timothy B. Terriberry
2013-05-17 10:11:30 -07:00
committed by Yaowu Xu
parent 75fca6fff0
commit 95339d6825
2 changed files with 80 additions and 48 deletions

View File

@@ -19,22 +19,32 @@
#include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_idct.h"
void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i; int i;
int16_t output[16]; int16_t output[16];
int a1, b1, c1, d1; int a1, b1, c1, d1, e1;
int16_t *ip = input; int16_t *ip = input;
int16_t *op = output; int16_t *op = output;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR; a1 = ip[0] >> WHT_UPSCALE_FACTOR;
b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR; c1 = ip[1] >> WHT_UPSCALE_FACTOR;
c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR; d1 = ip[2] >> WHT_UPSCALE_FACTOR;
d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR; b1 = ip[3] >> WHT_UPSCALE_FACTOR;
op[0] = (a1 + b1 + 1) >> 1; c1 = a1 - c1;
op[1] = (c1 + d1) >> 1; b1 += d1;
op[2] = (a1 - b1) >> 1; e1 = (c1 - b1) >> 1;
op[3] = (d1 - c1) >> 1; a1 -= e1;
d1 += e1;
b1 = a1 - b1;
c1 -= d1;
op[0] = a1;
op[1] = b1;
op[2] = c1;
op[3] = d1;
ip += 4; ip += 4;
op += 4; op += 4;
@@ -42,20 +52,23 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
ip = output; ip = output;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
a1 = ip[4 * 0] + ip[4 * 3]; a1 = ip[4 * 0];
b1 = ip[4 * 1] + ip[4 * 2]; c1 = ip[4 * 1];
c1 = ip[4 * 1] - ip[4 * 2]; d1 = ip[4 * 2];
d1 = ip[4 * 0] - ip[4 * 3]; b1 = ip[4 * 3];
c1 = a1 - c1;
b1 += d1;
e1 = (c1 - b1) >> 1;
a1 -= e1;
d1 += e1;
b1 = a1 - b1;
c1 -= d1;
dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
((a1 + b1 + 1) >> 1)); dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
((c1 + d1) >> 1)); dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
((a1 - b1) >> 1));
dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
((d1 - c1) >> 1));
ip++; ip++;
dest++; dest++;
@@ -64,23 +77,24 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
int i; int i;
int a1, e1;
int16_t tmp[4]; int16_t tmp[4];
int16_t *ip = in; int16_t *ip = in;
int16_t *op = tmp; int16_t *op = tmp;
op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; a1 = ip[0] >> WHT_UPSCALE_FACTOR;
op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1; e1 = a1 >> 1;
op[0] = op[1] = op[2] = a1 - e1;
op[3] = e1;
ip = tmp; ip = tmp;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + e1 = ip[0] >> 1;
((ip[0] + 1) >> 1)); a1 = ip[0] - e1;
dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
(ip[0] >> 1)); dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + a1);
dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + a1);
(ip[0] >> 1)); dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
(ip[0] >> 1));
ip++; ip++;
dest++; dest++;
} }

View File

@@ -591,23 +591,33 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
} }
} }
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
pixel. */
void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
int i; int i;
int a1, b1, c1, d1; int a1, b1, c1, d1, e1;
short *ip = input; short *ip = input;
short *op = output; short *op = output;
int pitch_short = pitch >> 1; int pitch_short = pitch >> 1;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
a1 = ip[0 * pitch_short] + ip[3 * pitch_short]; a1 = ip[0 * pitch_short];
b1 = ip[1 * pitch_short] + ip[2 * pitch_short]; b1 = ip[1 * pitch_short];
c1 = ip[1 * pitch_short] - ip[2 * pitch_short]; c1 = ip[2 * pitch_short];
d1 = ip[0 * pitch_short] - ip[3 * pitch_short]; d1 = ip[3 * pitch_short];
op[0] = (a1 + b1 + 1) >> 1; b1 = a1 - b1;
op[4] = (c1 + d1) >> 1; c1 += d1;
op[8] = (a1 - b1) >> 1; e1 = (c1 - b1) >> 1;
op[12] = (d1 - c1) >> 1; a1 += e1;
d1 -= e1;
c1 = a1 - c1;
b1 -= d1;
op[0] = a1;
op[4] = c1;
op[8] = d1;
op[12] = b1;
ip++; ip++;
op++; op++;
@@ -616,15 +626,23 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
op = output; op = output;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
a1 = ip[0] + ip[3]; a1 = ip[0];
b1 = ip[1] + ip[2]; b1 = ip[1];
c1 = ip[1] - ip[2]; c1 = ip[2];
d1 = ip[0] - ip[3]; d1 = ip[3];
op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR; b1 = a1 - b1;
op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR; c1 += d1;
op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR; e1 = (c1 - b1) >> 1;
op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR; a1 += e1;
d1 -= e1;
c1 = a1 - c1;
b1 -= d1;
op[0] = a1 << WHT_UPSCALE_FACTOR;
op[1] = c1 << WHT_UPSCALE_FACTOR;
op[2] = d1 << WHT_UPSCALE_FACTOR;
op[3] = b1 << WHT_UPSCALE_FACTOR;
ip += 4; ip += 4;
op += 4; op += 4;