dsp/dec_sse2: DC16 / DC16NoLeft speedup
Use psadbw to perform the top-row summation; the left column remains in C, as repacking it into a vector to apply the same operation is too costly. DC16: ~20% faster; DC16NoLeft: ~14% faster. Change-Id: I7ec3f8a6e5923f88a530f79fceb88d5001bef691
This commit is contained in:
parent
8e515dfeda
commit
7df2049785
@ -1139,12 +1139,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
|
||||
}
|
||||
|
||||
// DC16: computes the rounded average of the 16 samples in the row above
// (dst - BPS) and the 16 samples in the column to the left (dst[-1 + j * BPS]),
// i.e. (sum of 32 samples + 16) >> 5, and hands that value to Put16().
// NOTE(review): this is a diff rendering without +/- markers, so lines from
// before and after the commit are interleaved below. Judging by the hunk
// header (12 old lines -> 20 new lines), 'int DC = 16;', the 'DC += ...'
// loop statement and the final Put16() call are the removed scalar version;
// the SSE2 lines, 'left' and the inner braced block are the added version.
static void DC16(uint8_t* dst) { // DC
|
||||
// NOTE(review): appears to be a pre-commit (removed) line: scalar accumulator
// seeded with the rounding bias.
int DC = 16;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
// Unaligned load of the 16 bytes of the row directly above the block.
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
|
||||
// psadbw against zero: each 64-bit half receives the sum of its 8 bytes.
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
|
||||
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
|
||||
// The shuffle brings 32-bit lane 2 (the upper half's sum) down to lane 0, so
// lane 0 of 'sum' holds the total of all 16 top bytes (at most 16 * 255).
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
|
||||
// The left column is strided by BPS, so it is accumulated in scalar code.
int left = 0;
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
// NOTE(review): appears to be a pre-commit (removed) line: adds one left and
// one top sample per iteration.
DC += dst[-1 + j * BPS] + dst[j - BPS];
|
||||
left += dst[-1 + j * BPS];
|
||||
}
|
||||
{
|
||||
// Low 32 bits of 'sum' (top-row total) + left-column total + 16 for rounding
// before the divide-by-32 shift.
const int DC = _mm_cvtsi128_si32(sum) + left + 16;
|
||||
Put16(DC >> 5, dst);
|
||||
}
|
||||
// NOTE(review): appears to be a pre-commit (removed) line, superseded by the
// Put16() call inside the braced block above.
Put16(DC >> 5, dst);
|
||||
}
|
||||
|
||||
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
||||
@ -1157,11 +1165,12 @@ static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
||||
}
|
||||
|
||||
// DC16NoLeft: computes the rounded average of the 16 samples in the row above
// (dst - BPS), i.e. (sum of 16 samples + 8) >> 4, and hands it to Put16().
// NOTE(review): this is a diff rendering without +/- markers, so lines from
// before and after the commit are interleaved below. Judging by the hunk
// header (11 old lines -> 12 new lines), 'int DC = 8;', 'int i;' and the
// scalar for-loop are the removed version; the SSE2 lines and 'const int DC'
// are the added version. The two 'DC' declarations never coexist in a real
// revision of the file.
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
|
||||
// NOTE(review): appears to be pre-commit (removed): scalar accumulator seeded
// with the rounding bias.
int DC = 8;
|
||||
int i;
|
||||
// NOTE(review): appears to be pre-commit (removed): scalar sum of the top row.
for (i = 0; i < 16; ++i) {
|
||||
DC += dst[i - BPS];
|
||||
}
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
// Unaligned load of the 16 bytes of the row directly above the block.
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
|
||||
// psadbw against zero: each 64-bit half receives the sum of its 8 bytes.
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
|
||||
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
|
||||
// The shuffle brings 32-bit lane 2 (the upper half's sum) down to lane 0, so
// lane 0 of 'sum' holds the total of all 16 top bytes.
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
|
||||
// Low 32 bits of 'sum' + 8 for rounding before the divide-by-16 shift.
const int DC = _mm_cvtsi128_si32(sum) + 8;
|
||||
Put16(DC >> 4, dst);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user