2014-10-07 16:36:14 -07:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* Use of this source code is governed by a BSD-style license
|
|
|
|
* that can be found in the LICENSE file in the root of the source
|
|
|
|
* tree. An additional intellectual property rights grant can be found
|
|
|
|
* in the file PATENTS. All contributing project authors may
|
|
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
|
|
*/
|
|
|
|
|
2014-10-10 10:05:47 +02:00
|
|
|
#include <emmintrin.h>
|
2014-10-07 16:36:14 -07:00
|
|
|
#include "vpx_ports/mem.h"
|
|
|
|
|
2015-04-15 17:48:20 -07:00
|
|
|
// Finds the smallest and largest absolute difference between co-located
// samples of two 8x8 blocks of 8-bit pixels.
//   s, p     - first block and its row stride (in bytes).
//   d, dp    - second block and its row stride (in bytes).
//   min, max - receive the minimum and maximum of |s[r][c] - d[r][c]|.
// Exactly 8 bytes are read from each of the 8 rows of both blocks.
void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  const __m128i zero = _mm_setzero_si128();
  // Absolute differences lie in [0, 255], so 0 and 0x7fff are neutral
  // starting values for the running maximum and minimum respectively.
  __m128i maxabsdiff = zero;
  __m128i minabsdiff = _mm_set1_epi16(0x7fff);
  int row;

  for (row = 0; row < 8; ++row) {
    // Widen one row of each block from 8-bit to 16-bit lanes.
    const __m128i s0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(s + row * p)), zero);
    const __m128i d0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(d + row * dp)), zero);
    // |x| computed as max(x, -x) on signed 16-bit lanes.
    const __m128i diff = _mm_subs_epi16(s0, d0);
    const __m128i negdiff = _mm_subs_epi16(zero, diff);
    const __m128i absdiff = _mm_max_epi16(diff, negdiff);

    maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
    minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  }

  // Horizontal reduction: fold 8 lanes -> 4 -> 2 -> 1. Only lane 0 is read
  // afterwards, and lane 0 is only ever combined with genuine data lanes, so
  // the zeros shifted into the upper lanes do not disturb the minimum.
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}
|
2014-10-07 16:36:14 -07:00
|
|
|
|
|
|
|
// Returns the rounded mean of an 8x8 block of 8-bit pixels.
//   s - top-left pixel of the block.
//   p - row stride in bytes.
// The 64 samples are summed (at most 64 * 255 = 16320, which fits a 16-bit
// lane) and the sum is divided by 64 with round-to-nearest.
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum = zero;
  int row;

  // Accumulate the eight rows as per-column sums in 16-bit lanes.
  for (row = 0; row < 8; ++row) {
    const __m128i px = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(s + row * p)), zero);
    sum = _mm_adds_epu16(sum, px);
  }

  // Fold the eight column sums into lane 0: 8 -> 4 -> 2 -> 1.
  sum = _mm_adds_epu16(sum, _mm_srli_si128(sum, 8));
  sum = _mm_adds_epu16(sum, _mm_srli_epi64(sum, 32));
  sum = _mm_adds_epu16(sum, _mm_srli_epi64(sum, 16));

  return ((unsigned int)_mm_extract_epi16(sum, 0) + 32) >> 6;
}
|
2014-11-12 14:51:49 -08:00
|
|
|
|
|
|
|
// Returns the rounded mean of a 4x4 block of 8-bit pixels.
//   s - top-left pixel of the block.
//   p - row stride in bytes.
// Exactly four bytes are read from each of the four rows, so the function is
// safe on a tightly packed 4x4 buffer (an 8-byte row load would run four
// bytes past the end of the last row).
unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum = zero;
  int row;

  for (row = 0; row < 4; ++row) {
    const uint8_t *r = s + row * p;
    // Assemble the four pixels into one little-endian 32-bit word; this is
    // an alignment- and aliasing-safe 4-byte load.
    const unsigned int packed = (unsigned int)r[0] |
                                ((unsigned int)r[1] << 8) |
                                ((unsigned int)r[2] << 16) |
                                ((unsigned int)r[3] << 24);
    // Widen to 16-bit lanes: lanes 0..3 hold the pixels, lanes 4..7 are zero.
    const __m128i px =
        _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)packed), zero);
    sum = _mm_adds_epu16(sum, px);
  }

  // Fold the four column sums into lane 0: 4 -> 2 -> 1.
  sum = _mm_adds_epu16(sum, _mm_srli_si128(sum, 4));
  sum = _mm_adds_epu16(sum, _mm_srli_epi64(sum, 16));

  // Sum of 16 samples divided by 16 with round-to-nearest.
  return ((unsigned int)_mm_extract_epi16(sum, 0) + 8) >> 4;
}
|
Integral projection based motion estimation
This commit introduces a new block match motion estimation
using integral projection measurement. The 2-D block and the nearby
region are projected onto the horizontal and vertical 1-D vectors,
respectively. It then runs vector match, instead of block match,
over the two separate 1-D vectors to locate the motion compensated
reference block.
This process is run per 64x64 block to align the reference before
choosing partitioning in speed 6. The overall CPU cycle cost due
to this additional 64x64 block match (SSE2 version) takes around 2%
at low bit-rate rtc speed 6. When strong motion activities exist in
the video sequence, it substantially improves the partition
selection accuracy, thereby achieving better compression performance
and lower CPU cycles.
The experiments were tested in RTC speed -6 setting:
cloud 1080p 500 kbps
17006 b/f, 37.086 dB, 5386 ms ->
16669 b/f, 37.970 dB, 5085 ms (>0.9dB gain and 6% faster)
pedestrian_area 1080p 500 kbps
53537 b/f, 36.771 dB, 18706 ms ->
51897 b/f, 36.792 dB, 18585 ms (4% bit-rate savings)
blue_sky 1080p 500 kbps
70214 b/f, 33.600 dB, 13979 ms ->
53885 b/f, 33.645 dB, 10878 ms (30% bit-rate savings, 25% faster)
jimred 400 kbps
13380 b/f, 36.014 dB, 5723 ms ->
13377 b/f, 36.087 dB, 5831 ms (2% bit-rate savings, 2% slower)
Change-Id: Iffdb6ea5b16b77016bfa3dd3904d284168ae649c
2015-02-13 11:23:45 -08:00
|
|
|
|
2015-03-23 10:02:42 -07:00
|
|
|
// One pass of an 8-point Hadamard butterfly applied across the eight vectors
// in[0..7] (each vector holds eight 16-bit lanes; every lane is transformed
// independently).
//   in   - eight vectors, transformed in place.
//   iter - 0: after the butterflies the 8x8 matrix of lanes is transposed, so
//             that a second call operates along the other dimension;
//          non-zero: results are written back without transposing.
// All arithmetic is wrapping 16-bit add/subtract.
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i x[8], y[8];
  int k;

  // Stage 1: sums and differences of adjacent pairs.
  for (k = 0; k < 8; k += 2) {
    x[k] = _mm_add_epi16(in[k], in[k + 1]);
    x[k + 1] = _mm_sub_epi16(in[k], in[k + 1]);
  }

  // Stage 2: combine pairs within each group of four.
  for (k = 0; k < 8; k += 4) {
    y[k] = _mm_add_epi16(x[k], x[k + 2]);
    y[k + 1] = _mm_add_epi16(x[k + 1], x[k + 3]);
    y[k + 2] = _mm_sub_epi16(x[k], x[k + 2]);
    y[k + 3] = _mm_sub_epi16(x[k + 1], x[k + 3]);
  }

  // Stage 3: combine the two groups of four. The destination indices carry
  // the output ordering used by this transform.
  x[0] = _mm_add_epi16(y[0], y[4]);
  x[7] = _mm_add_epi16(y[1], y[5]);
  x[3] = _mm_add_epi16(y[2], y[6]);
  x[4] = _mm_add_epi16(y[3], y[7]);
  x[2] = _mm_sub_epi16(y[0], y[4]);
  x[6] = _mm_sub_epi16(y[1], y[5]);
  x[1] = _mm_sub_epi16(y[2], y[6]);
  x[5] = _mm_sub_epi16(y[3], y[7]);

  if (iter != 0) {
    for (k = 0; k < 8; ++k) in[k] = x[k];
    return;
  }

  // 8x8 transpose of 16-bit lanes: interleave at 16, 32 and then 64 bits.
  y[0] = _mm_unpacklo_epi16(x[0], x[1]);
  y[1] = _mm_unpacklo_epi16(x[2], x[3]);
  y[2] = _mm_unpackhi_epi16(x[0], x[1]);
  y[3] = _mm_unpackhi_epi16(x[2], x[3]);
  y[4] = _mm_unpacklo_epi16(x[4], x[5]);
  y[5] = _mm_unpacklo_epi16(x[6], x[7]);
  y[6] = _mm_unpackhi_epi16(x[4], x[5]);
  y[7] = _mm_unpackhi_epi16(x[6], x[7]);

  x[0] = _mm_unpacklo_epi32(y[0], y[1]);
  x[1] = _mm_unpacklo_epi32(y[4], y[5]);
  x[2] = _mm_unpackhi_epi32(y[0], y[1]);
  x[3] = _mm_unpackhi_epi32(y[4], y[5]);
  x[4] = _mm_unpacklo_epi32(y[2], y[3]);
  x[5] = _mm_unpacklo_epi32(y[6], y[7]);
  x[6] = _mm_unpackhi_epi32(y[2], y[3]);
  x[7] = _mm_unpackhi_epi32(y[6], y[7]);

  for (k = 0; k < 8; k += 2) {
    in[k] = _mm_unpacklo_epi64(x[k], x[k + 1]);
    in[k + 1] = _mm_unpackhi_epi64(x[k], x[k + 1]);
  }
}
|
|
|
|
|
|
|
|
// 8x8 Hadamard transform of a block of 16-bit residuals.
//   src_diff   - top-left sample of the 8x8 input block.
//   src_stride - distance between input rows, in int16_t elements.
//   coeff      - receives the 64 output coefficients, stored as eight
//                consecutive rows of eight.
// Both src_diff rows and coeff are accessed with aligned 128-bit loads and
// stores, so every row address must be 16-byte aligned.
void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  int row;

  for (row = 0; row < 8; ++row) {
    src[row] = _mm_load_si128((const __m128i *)(src_diff + row * src_stride));
  }

  // First pass transforms one dimension and transposes; the second pass then
  // transforms the other dimension and leaves the result un-transposed.
  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  for (row = 0; row < 8; ++row) {
    _mm_store_si128((__m128i *)(coeff + row * 8), src[row]);
  }
}
|
|
|
|
|
2015-03-30 12:31:46 -07:00
|
|
|
// 16x16 Hadamard transform built from four 8x8 transforms followed by one
// extra butterfly stage that combines the four 8x8 results.
//   src_diff   - top-left sample of the 16x16 input block.
//   src_stride - distance between input rows, in int16_t elements.
//   coeff      - receives 256 coefficients: four 64-entry groups, one per
//                8x8 quadrant position, each holding the combined result.
// coeff (and the src_diff rows, via vp9_hadamard_8x8_sse2) are accessed with
// aligned 128-bit loads/stores.
void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  const __m128i one = _mm_set1_epi16(1);
  int idx;

  // Quadrants in raster order: (idx >> 1) selects the row of 8x8 blocks,
  // (idx & 1) the column.
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                             + (idx & 0x01) * 8;
    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    const __m128i c0 = _mm_load_si128((const __m128i *)coeff);
    const __m128i c1 = _mm_load_si128((const __m128i *)(coeff + 64));
    const __m128i c2 = _mm_load_si128((const __m128i *)(coeff + 128));
    const __m128i c3 = _mm_load_si128((const __m128i *)(coeff + 192));

    const __m128i b0 = _mm_add_epi16(c0, c1);
    const __m128i b1 = _mm_sub_epi16(c0, c1);
    const __m128i b2 = _mm_add_epi16(c2, c3);
    const __m128i b3 = _mm_sub_epi16(c2, c3);

    // The outputs are (b0 +/- b2) >> 1 and (b1 +/- b3) >> 1. Adding two
    // 16-bit lanes first can wrap before the halving shift, so each operand
    // is halved up front and the low-order bit is restored separately:
    //   (x + y) >> 1 == (x >> 1) + (y >> 1) + ( x & y & 1)
    //   (x - y) >> 1 == (x >> 1) - (y >> 1) - (~x & y & 1)
    // This is bit-identical to the widened computation.
    const __m128i h0 = _mm_srai_epi16(b0, 1);
    const __m128i h1 = _mm_srai_epi16(b1, 1);
    const __m128i h2 = _mm_srai_epi16(b2, 1);
    const __m128i h3 = _mm_srai_epi16(b3, 1);

    const __m128i carry02 = _mm_and_si128(_mm_and_si128(b0, b2), one);
    const __m128i carry13 = _mm_and_si128(_mm_and_si128(b1, b3), one);
    // _mm_andnot_si128(a, b) computes (~a) & b.
    const __m128i borrow02 = _mm_and_si128(_mm_andnot_si128(b0, b2), one);
    const __m128i borrow13 = _mm_and_si128(_mm_andnot_si128(b1, b3), one);

    _mm_store_si128((__m128i *)coeff,
                    _mm_add_epi16(_mm_add_epi16(h0, h2), carry02));
    _mm_store_si128((__m128i *)(coeff + 64),
                    _mm_add_epi16(_mm_add_epi16(h1, h3), carry13));
    _mm_store_si128((__m128i *)(coeff + 128),
                    _mm_sub_epi16(_mm_sub_epi16(h0, h2), borrow02));
    _mm_store_si128((__m128i *)(coeff + 192),
                    _mm_sub_epi16(_mm_sub_epi16(h1, h3), borrow13));

    coeff += 8;
  }
}
|
|
|
|
|
2015-03-23 10:02:42 -07:00
|
|
|
// Sum of absolute values of a coefficient vector.
//   coeff  - coefficients; read with aligned 128-bit loads, so the pointer
//            must be 16-byte aligned.
//   length - number of coefficients; processed eight at a time. The first
//            eight are always read, whatever the value of length.
// Accumulation is done with wrapping 16-bit adds, so the returned value is
// the true sum reduced modulo 2^16.
int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  // |x| via the sign mask: (x ^ sign) - sign, where sign is 0 or -1.
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);
  coeff += 8;

  for (i = 8; i < length; i += 8) {
    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  // Fold the eight lane totals into lane 0: 8 -> 4 -> 2 -> 1.
  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}
|
|
|
|
|
Integral projection based motion estimation
This commit introduces a new block match motion estimation
using integral projection measurement. The 2-D block and the nearby
region are projected onto the horizontal and vertical 1-D vectors,
respectively. It then runs vector match, instead of block match,
over the two separate 1-D vectors to locate the motion compensated
reference block.
This process is run per 64x64 block to align the reference before
choosing partitioning in speed 6. The overall CPU cycle cost due
to this additional 64x64 block match (SSE2 version) takes around 2%
at low bit-rate rtc speed 6. When strong motion activities exist in
the video sequence, it substantially improves the partition
selection accuracy, thereby achieving better compression performance
and lower CPU cycles.
The experiments were tested in RTC speed -6 setting:
cloud 1080p 500 kbps
17006 b/f, 37.086 dB, 5386 ms ->
16669 b/f, 37.970 dB, 5085 ms (>0.9dB gain and 6% faster)
pedestrian_area 1080p 500 kbps
53537 b/f, 36.771 dB, 18706 ms ->
51897 b/f, 36.792 dB, 18585 ms (4% bit-rate savings)
blue_sky 1080p 500 kbps
70214 b/f, 33.600 dB, 13979 ms ->
53885 b/f, 33.645 dB, 10878 ms (30% bit-rate savings, 25% faster)
jimred 400 kbps
13380 b/f, 36.014 dB, 5723 ms ->
13377 b/f, 36.087 dB, 5831 ms (2% bit-rate savings, 2% slower)
Change-Id: Iffdb6ea5b16b77016bfa3dd3904d284168ae649c
2015-02-13 11:23:45 -08:00
|
|
|
// Vertical integral projection of a 16-pixel-wide strip: for each of the 16
// columns, sums the pixels over `height` rows and scales the sum down.
//   hbuf       - receives 16 scaled column sums (unaligned store).
//   ref        - top-left pixel of the strip (unaligned loads).
//   ref_stride - row stride in bytes.
//   height     - number of rows; rows are consumed as one leading row, then
//                pairs, then one trailing row, which totals `height` rows
//                when height is even.
// The sum is shifted right by 5 for height 64, by 4 for height 32 and by 3
// for any other height.
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;
  const __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  // s0 holds the sums for columns 0..7, s1 for columns 8..15.
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  const int height_1 = height - 1;
  ref += ref_stride;

  // Two rows per iteration.
  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  // Trailing row.
  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}
|
|
|
|
|
|
|
|
// Horizontal integral projection: returns the sum of `width` consecutive
// 8-bit pixels starting at ref.
//   ref   - first pixel; need not be 16-byte aligned (unaligned loads are
//           used, matching vp9_int_pro_row_sse2).
//   width - number of pixels; consumed 16 at a time, and the first 16 are
//           always read.
int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
  const __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  // SAD against zero yields the sum of each 8-byte half in the low 16 bits
  // of the corresponding 64-bit lane.
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_loadu_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  // Add the upper-half total onto the lower-half total.
  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}
|
|
|
|
|
2015-02-27 13:35:22 -08:00
|
|
|
/*
 * Computes a variance-style measure of the element-wise difference between
 * two int16_t vectors of length width = (4 << bwl):
 *
 *   diff[i] = ref[i] - src[i]        (saturating 16-bit subtraction)
 *   result  = sum(diff[i]^2) - ((sum(diff[i]))^2 >> (bwl + 2))
 *
 * ref  First input vector; read with unaligned loads.
 * src  Second input vector; read with unaligned loads, so no particular
 *      alignment is required (16-byte aligned callers behave as before).
 * bwl  log2(width) - 2, i.e. width == 4 << bwl.
 *
 * Notes:
 *  - Elements are consumed 8 at a time and the first 8 are loaded
 *    unconditionally, so at least 8 elements are read from each input.
 *  - The running sum of differences is kept in 16-bit lanes
 *    (_mm_add_epi16), so it wraps if the total leaves the int16_t range.
 */
int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
                        const int bwl) {
  int idx;
  const int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  // Unaligned load: the signature gives no alignment guarantee for src.
  __m128i v1 = _mm_loadu_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;                         // 8 x 16-bit partial sums
  __m128i sse = _mm_madd_epi16(diff, diff);   // 4 x 32-bit partial squares

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_loadu_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  // Horizontal reduction of the eight 16-bit sums into lane 0.
  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  // Horizontal reduction of the four 32-bit squared sums into lane 0.
  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  // _mm_extract_epi16 zero-extends; storing into int16_t restores the sign.
  mean = _mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}
|