2014-10-07 16:36:14 -07:00
|
|
|
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
|
2015-05-14 20:08:03 -07:00
|
|
|
#include "./vp9_rtcd.h"
|
2014-10-16 14:36:07 +01:00
|
|
|
#include "vp9/common/vp9_common.h"
|
2014-10-07 16:36:14 -07:00
|
|
|
#include "vpx_ports/mem.h"
|
|
|
|
|
|
|
|
// Rounded mean of an 8x8 block of 8-bit samples.
//
// s: pointer to the top-left sample of the block.
// p: stride, in samples, between the starts of consecutive rows.
//
// Returns (sum of the 64 samples + 32) >> 6, i.e. the sum divided by 64
// with halves rounded up.
unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
  int row, col;
  int sum = 0;

  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col) {
      sum += s[col];
    }
    s += p;  // step to the next row
  }

  return (sum + 32) >> 6;
}
|
2014-10-16 14:36:07 +01:00
|
|
|
|
2014-11-12 14:51:49 -08:00
|
|
|
// Rounded mean of a 4x4 block of 8-bit samples.
//
// s: pointer to the top-left sample of the block.
// p: stride, in samples, between the starts of consecutive rows.
//
// Returns (sum of the 16 samples + 8) >> 4, i.e. the sum divided by 16
// with halves rounded up.
unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
  int row, col;
  int sum = 0;

  for (row = 0; row < 4; ++row) {
    for (col = 0; col < 4; ++col) {
      sum += s[col];
    }
    s += p;  // step to the next row
  }

  return (sum + 8) >> 4;
}
|
|
|
|
|
2015-06-12 10:38:45 -07:00
|
|
|
// One 8-point butterfly pass over a single column.
//
// Reads eight values from src_diff, spaced src_stride elements apart, runs
// three add/subtract butterfly stages on them and writes the eight results
// to coeff[0..7] in the permuted order used below.
//
// src_diff: first pass, 9 bit, dynamic range [-255, 255]
//           second pass, 12 bit, dynamic range [-2040, 2040]
static void hadamard_col8(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  const int16_t r0 = src_diff[0 * src_stride];
  const int16_t r1 = src_diff[1 * src_stride];
  const int16_t r2 = src_diff[2 * src_stride];
  const int16_t r3 = src_diff[3 * src_stride];
  const int16_t r4 = src_diff[4 * src_stride];
  const int16_t r5 = src_diff[5 * src_stride];
  const int16_t r6 = src_diff[6 * src_stride];
  const int16_t r7 = src_diff[7 * src_stride];

  // Stage 1: sums and differences of adjacent pairs.
  const int16_t b0 = r0 + r1;
  const int16_t b1 = r0 - r1;
  const int16_t b2 = r2 + r3;
  const int16_t b3 = r2 - r3;
  const int16_t b4 = r4 + r5;
  const int16_t b5 = r4 - r5;
  const int16_t b6 = r6 + r7;
  const int16_t b7 = r6 - r7;

  // Stage 2: combine within each half of four.
  const int16_t c0 = b0 + b2;
  const int16_t c1 = b1 + b3;
  const int16_t c2 = b0 - b2;
  const int16_t c3 = b1 - b3;
  const int16_t c4 = b4 + b6;
  const int16_t c5 = b5 + b7;
  const int16_t c6 = b4 - b6;
  const int16_t c7 = b5 - b7;

  // Stage 3: combine the two halves; outputs listed in index order.
  coeff[0] = c0 + c4;
  coeff[1] = c2 - c6;
  coeff[2] = c0 - c4;
  coeff[3] = c2 + c6;
  coeff[4] = c3 + c7;
  coeff[5] = c3 - c7;
  coeff[6] = c1 - c5;
  coeff[7] = c1 + c5;
}
|
|
|
|
|
|
|
|
// 2-D 8x8 transform built from two passes of hadamard_col8().
//
// src_diff:   top-left of the 8x8 input block.
// src_stride: distance, in elements, between consecutive input rows.
// coeff:      receives the 64 output coefficients.
void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                        int16_t *coeff) {
  int16_t buffer[64];
  int idx;

  // First pass: transform each of the 8 input columns into 8 consecutive
  // entries of the scratch buffer.
  // src_diff: 9 bit, dynamic range [-255, 255]
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(src_diff + idx, src_stride, buffer + 8 * idx);
  }

  // Second pass: transform each column of the scratch buffer (stride 8)
  // into 8 consecutive output coefficients.
  // buffer: 12 bit, dynamic range [-2040, 2040]
  // coeff:  15 bit, dynamic range [-16320, 16320]
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(buffer + idx, 8, coeff + 8 * idx);
  }
}
|
|
|
|
|
|
|
|
// 16x16 2-D Hadamard transform.
//
// The four 8x8 quadrants of src_diff are transformed with
// vp9_hadamard_8x8_c() into four consecutive 64-entry groups of coeff; a
// final butterfly stage then combines the four groups in place in coeff.
//
// src_diff:   top-left of the 16x16 input block.
// src_stride: distance, in elements, between consecutive input rows.
// coeff:      receives the 256 output coefficients.
void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
                          int16_t *coeff) {
  int idx;

  for (idx = 0; idx < 4; ++idx) {
    // Quadrant order: top-left, top-right, bottom-left, bottom-right.
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int row_offset = (idx >> 1) * 8 * src_stride;
    const int col_offset = (idx & 0x01) * 8;
    vp9_hadamard_8x8_c(src_diff + row_offset + col_offset, src_stride,
                       coeff + idx * 64);
  }

  // coeff: 15 bit, dynamic range [-16320, 16320]
  for (idx = 0; idx < 64; ++idx) {
    const int16_t a0 = coeff[idx];
    const int16_t a1 = coeff[idx + 64];
    const int16_t a2 = coeff[idx + 128];
    const int16_t a3 = coeff[idx + 192];

    // (a0 + a1): 16 bit, [-32640, 32640]
    // b0-b3: 15 bit, dynamic range [-16320, 16320]
    const int16_t b0 = (a0 + a1) >> 1;
    const int16_t b1 = (a0 - a1) >> 1;
    const int16_t b2 = (a2 + a3) >> 1;
    const int16_t b3 = (a2 - a3) >> 1;

    // 16 bit, [-32640, 32640]
    coeff[idx] = b0 + b2;
    coeff[idx + 64] = b1 + b3;
    coeff[idx + 128] = b0 - b2;
    coeff[idx + 192] = b1 - b3;
  }
}
|
|
|
|
|
|
|
|
// Sum of absolute values of the first `length` entries of coeff.
//
// The total is accumulated in an int and narrowed to int16_t on return.
// NOTE(review): a total outside the int16_t range is not preserved by that
// final cast; callers are presumably expected to stay within range.
int16_t vp9_satd_c(const int16_t *coeff, int length) {
  int total = 0;
  int i;

  for (i = 0; i < length; ++i) {
    total += abs(coeff[i]);
  }

  return (int16_t)total;
}
|
|
|
|
|
/*
 * Integral projection based motion estimation
 *
 * This commit introduces a new block match motion estimation
 * using integral projection measurement. The 2-D block and the nearby
 * region are projected onto the horizontal and vertical 1-D vectors,
 * respectively. It then runs vector match, instead of block match,
 * over the two separate 1-D vectors to locate the motion compensated
 * reference block.
 *
 * This process is run per 64x64 block to align the reference before
 * choosing partitioning in speed 6. The overall CPU cycle cost due
 * to this additional 64x64 block match (SSE2 version) is around 2%
 * at low bit-rate rtc speed 6. When strong motion activities exist in
 * the video sequence, it substantially improves the partition
 * selection accuracy, thereby achieving better compression performance
 * and lower CPU cycles.
 *
 * The experiments were tested in RTC speed -6 setting:
 * cloud 1080p 500 kbps
 * 17006 b/f, 37.086 dB, 5386 ms ->
 * 16669 b/f, 37.970 dB, 5085 ms (>0.9dB gain and 6% faster)
 * pedestrian_area 1080p 500 kbps
 * 53537 b/f, 36.771 dB, 18706 ms ->
 * 51897 b/f, 36.792 dB, 18585 ms (4% bit-rate savings)
 * blue_sky 1080p 500 kbps
 * 70214 b/f, 33.600 dB, 13979 ms ->
 * 53885 b/f, 33.645 dB, 10878 ms (30% bit-rate savings, 25% faster)
 * jimred 400 kbps
 * 13380 b/f, 36.014 dB, 5723 ms ->
 * 13377 b/f, 36.087 dB, 5831 ms (2% bit-rate savings, 2% slower)
 *
 * Change-Id: Iffdb6ea5b16b77016bfa3dd3904d284168ae649c
 * 2015-02-13 11:23:45 -08:00
 */
|
|
|
// Integer projection onto row vectors.
//
// For each of 16 adjacent columns starting at ref, sums `height` samples
// down the column (rows are ref_stride apart) and stores the sum divided by
// max(8, height / 2) in hbuf[column].
//
// hbuf:       receives 16 normalized column sums.
// ref:        top-left sample of the 16-column-wide region.
// ref_stride: distance, in samples, between consecutive rows.
// height:     number of rows to sum.
void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
                       const int ref_stride, const int height) {
  const int half_height = height >> 1;
  const int norm_factor = half_height > 8 ? half_height : 8;
  int col;

  for (col = 0; col < 16; ++col) {
    // Accumulate in int so the column total is never narrowed to 16 bits
    // before the division.
    int column_sum = 0;
    int row;
    for (row = 0; row < height; ++row) {
      column_sum += ref[row * ref_stride + col];
    }
    hbuf[col] = (int16_t)(column_sum / norm_factor);
  }
}
|
|
|
|
|
|
|
|
// Sum of the first `width` samples starting at ref.
//
// The running total is kept in an int16_t, so it is narrowed to 16 bits
// after every addition.
int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
  int16_t sum = 0;
  int idx;

  for (idx = 0; idx < width; ++idx) {
    sum += ref[idx];
  }

  return sum;
}
|
|
|
|
|
2015-02-27 13:35:22 -08:00
|
|
|
// Variance-style measure of the difference between two 1-D vectors.
//
// ref, src: vectors of (4 << bwl) entries each.
// bwl:      log2 of the vector length, minus 2.
//
// Returns sum(diff^2) - (sum(diff)^2 >> (bwl + 2)), where
// diff = ref[i] - src[i]; the shift divides by the vector length.
int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
                     const int bwl) {
  const int width = 4 << bwl;
  int sse = 0;
  int mean = 0;  // running sum of differences, not yet divided by width
  int i;

  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    mean += diff;
    sse += diff * diff;
  }

  return sse - ((mean * mean) >> (bwl + 2));
}
|
|
|
|
|
2015-04-15 17:48:20 -07:00
|
|
|
// Smallest and largest absolute sample difference between two 8x8 blocks
// of 8-bit samples.
//
// s, p:   first block and its row stride, in samples.
// d, dp:  second block and its row stride, in samples.
// min:    receives the smallest |s - d| over the 64 positions.
// max:    receives the largest |s - d| over the 64 positions.
void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
                      int *min, int *max) {
  // 8-bit samples differ by at most 255, so these seeds are always replaced
  // by a real difference where needed.
  int lowest = 255;
  int highest = 0;
  int row, col;

  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col) {
      const int diff = abs(s[col] - d[col]);
      if (diff < lowest) lowest = diff;
      if (diff > highest) highest = diff;
    }
    s += p;
    d += dp;
  }

  *min = lowest;
  *max = highest;
}
|
|
|
|
|
2014-10-16 14:36:07 +01:00
|
|
|
#if CONFIG_VP9_HIGHBITDEPTH
|
|
|
|
// Rounded mean of an 8x8 block of 16-bit samples.
//
// s8: sample pointer in the uint8_t form accepted by CONVERT_TO_SHORTPTR(),
//     which yields the uint16_t pointer actually read.
// p:  stride, in uint16_t samples, between the starts of consecutive rows.
//
// Returns (sum of the 64 samples + 32) >> 6.
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  int row, col;
  int sum = 0;

  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col) {
      sum += s[col];
    }
    s += p;  // step to the next row
  }

  return (sum + 32) >> 6;
}
|
2014-11-12 14:51:49 -08:00
|
|
|
|
|
|
|
// Rounded mean of a 4x4 block of 16-bit samples.
//
// s8: sample pointer in the uint8_t form accepted by CONVERT_TO_SHORTPTR(),
//     which yields the uint16_t pointer actually read.
// p:  stride, in uint16_t samples, between the starts of consecutive rows.
//
// Returns (sum of the 16 samples + 8) >> 4.
unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  int row, col;
  int sum = 0;

  for (row = 0; row < 4; ++row) {
    for (col = 0; col < 4; ++col) {
      sum += s[col];
    }
    s += p;  // step to the next row
  }

  return (sum + 8) >> 4;
}
|
2015-04-15 17:48:20 -07:00
|
|
|
|
|
|
|
// Smallest and largest absolute sample difference between two 8x8 blocks
// of 16-bit samples.
//
// s8, p:   first block (in the uint8_t pointer form accepted by
//          CONVERT_TO_SHORTPTR()) and its row stride in uint16_t samples.
// d8, dp:  second block and its row stride, same conventions.
// min:     receives the smallest |s - d| over the 64 positions.
// max:     receives the largest |s - d| over the 64 positions.
void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                             int dp, int *min, int *max) {
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
  // Two uint16_t samples can differ by up to 65535. Seeding the minimum
  // with 255 (the 8-bit bound) would report 255 for any block whose every
  // difference exceeds 255, so seed with the 16-bit bound instead.
  int lowest = 65535;
  int highest = 0;
  int row, col;

  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col) {
      const int diff = abs(s[col] - d[col]);
      if (diff < lowest) lowest = diff;
      if (diff > highest) highest = diff;
    }
    s += p;
    d += dp;
  }

  *min = lowest;
  *max = highest;
}
|
2014-10-16 14:36:07 +01:00
|
|
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
|
|
|
|
2014-11-12 14:51:49 -08:00
|
|
|
|