SSE implementation of decoding predictors.

Change-Id: I5c9ae63afc98013cb45ce8a91f051203ac68402c
This commit is contained in:
Vincent Rabaud 2016-11-30 12:00:05 +01:00
parent 34aee99026
commit 67879e6d48
3 changed files with 150 additions and 5 deletions

View File

@ -600,6 +600,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
VP8LPredictors[13] = Predictor13;
VP8LPredictors[14] = Predictor0; // <- padding security sentinels
VP8LPredictors[15] = Predictor0;
memcpy(VP8LPredictors_C, VP8LPredictors, sizeof(VP8LPredictors));
VP8LPredictorsAdd[0] = PredictorAdd0;
VP8LPredictorsAdd[1] = PredictorAdd1;
@ -617,6 +618,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
VP8LPredictorsAdd[13] = PredictorAdd13;
VP8LPredictorsAdd[14] = PredictorAdd0; // <- padding security sentinels
VP8LPredictorsAdd[15] = PredictorAdd0;
memcpy(VP8LPredictorsAdd_C, VP8LPredictorsAdd, sizeof(VP8LPredictorsAdd));
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;

View File

@ -34,10 +34,12 @@ extern "C" {
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
VP8LPredictorFunc VP8LPredictors_C[16];
typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
const uint32_t* upper, int num_pixels,
uint32_t* out);
extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
typedef void (*VP8LProcessDecBlueAndRedFunc)(const uint32_t* src,
int num_pixels, uint32_t* dst);

View File

@ -155,15 +155,151 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
return pred;
}
// TODO(vrabaud): implement those functions in SSE.
// Batch versions of those functions.
// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i black = _mm_set1_epi32(ARGB_BLACK);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i res = _mm_add_epi8(src, black);
_mm_storeu_si128((__m128i*)&out[i], res);
}
VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
}
// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
// a | b | c | d
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
// 0 | a | b | c
const __m128i shift0 = _mm_slli_si128(src, 4);
// a | a + b | b + c | c + d
const __m128i sum0 = _mm_add_epi8(src, shift0);
// 0 | 0 | a | a + b
const __m128i shift1 = _mm_slli_si128(sum0, 8);
// a | a + b | a + b + c | a + b + c + d
const __m128i sum1 = _mm_add_epi8(sum0, shift1);
const __m128i prev = _mm_set1_epi32(out[i - 1]);
const __m128i res = _mm_add_epi8(sum1, prev);
_mm_storeu_si128((__m128i*)&out[i], res);
}
VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
}
// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \
const __m128i res = _mm_add_epi8(src, other); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
}
// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1
// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 10.
GENERATE_PREDICTOR_ADD(5)
GENERATE_PREDICTOR_ADD(6)
GENERATE_PREDICTOR_ADD(7)
GENERATE_PREDICTOR_ADD(8)
GENERATE_PREDICTOR_ADD(9)
GENERATE_PREDICTOR_ADD(10)
GENERATE_PREDICTOR_ADD(11)
GENERATE_PREDICTOR_ADD(12)
// Predictor11: select.
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i, j;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i TTL0 = _mm_subs_epu8(T, TL);
const __m128i TLT0 = _mm_subs_epu8(TL, T);
// |T - TL|
__m128i TTL = _mm_or_si128(TTL0, TLT0);
// in + T
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i sumTin =
_mm_add_epi8(src, _mm_loadu_si128((const __m128i*)&upper[i]));
for (j = 0; j < 4; ++j) {
int pa_minus_pb;
const __m128i L = _mm_cvtsi32_si128(out[i + j - 1]);
const __m128i LTL0 = _mm_subs_epu8(L, TL);
const __m128i TLL0 = _mm_subs_epu8(TL, L);
const __m128i LTL = _mm_or_si128(LTL0, TLL0);
const __m128i pTTL = _mm_unpacklo_epi8(TTL, zero); // |T - TL|
const __m128i pLTL = _mm_unpacklo_epi8(LTL, zero); // |L - TL|
const __m128i diff = _mm_sub_epi16(pLTL, pTTL);
{
int16_t tmp[8];
_mm_storeu_si128((__m128i*)tmp, diff);
pa_minus_pb = tmp[0] + tmp[1] + tmp[2] + tmp[3];
}
if (pa_minus_pb <= 0) {
// Add to upper (pre-computed value).
out[i + j] = _mm_cvtsi128_si32(sumTin);
} else {
// Add to left.
out[i + j] = _mm_cvtsi128_si32(_mm_add_epi8(src, L));
}
// Shift the pre-computed value for the next iteration.
TTL = _mm_srli_si128(TTL, 4);
TL = _mm_srli_si128(TL, 4);
src = _mm_srli_si128(src, 4);
sumTin = _mm_srli_si128(sumTin, 4);
}
}
VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
// Predictor12: ClampedAddSubtractFull.
static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i, j;
const __m128i zero = _mm_setzero_si128();
// +4 to not read outside of memory.
for (i = 0; i + 4 <= num_pixels; i += 2) {
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i T8 = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i T = _mm_unpacklo_epi8(T8, zero);
const __m128i TL8 = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i TL = _mm_unpacklo_epi8(TL8, zero);
__m128i diff = _mm_sub_epi16(T, TL);
for (j = 0; j < 2; ++j) {
const __m128i L8 = _mm_cvtsi32_si128(out[i + j - 1]);
const __m128i L = _mm_unpacklo_epi8(L8, zero);
const __m128i all = _mm_add_epi16(L, diff);
const __m128i alls = _mm_packus_epi16(all, all);
out[i + j] = _mm_cvtsi128_si32(_mm_add_epi8(src, alls));
// Shift the pre-computed value for the next iteration.
diff = _mm_srli_si128(diff, 8);
src = _mm_srli_si128(src, 4);
}
}
VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
}
// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 13.
GENERATE_PREDICTOR_ADD(13)
//------------------------------------------------------------------------------
@ -406,14 +542,19 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
VP8LPredictors[12] = Predictor12;
VP8LPredictors[13] = Predictor13;
VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
VP8LPredictorsAdd[5] = PredictorAdd5;
VP8LPredictorsAdd[6] = PredictorAdd6;
VP8LPredictorsAdd[7] = PredictorAdd7;
VP8LPredictorsAdd[8] = PredictorAdd8;
VP8LPredictorsAdd[9] = PredictorAdd9;
VP8LPredictorsAdd[10] = PredictorAdd10;
VP8LPredictorsAdd[11] = PredictorAdd11;
VP8LPredictorsAdd[12] = PredictorAdd12;
VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
VP8LPredictorsAdd[13] = PredictorAdd13;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;