fix loop bug in DispatchAlpha()
* We were re-doing most of the work in plain C as 'left-over'.
* We were always returning has_alpha = true because of a bad mask (all_0xff).

These bugs were conservative and silent, in the sense that we were 'just' doing more work than necessary. Now, the SSE2 version is really 2x faster than the C version.

Change-Id: I6c8132a267fe3c7a3d1fa70e7a5fcd10719543fa
This commit is contained in:
parent
3101f53720
commit
690b491af1
@ -210,7 +210,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
|
||||
const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
|
||||
num_rows, dst, buf->stride);
|
||||
|
||||
// alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
|
||||
// has_alpha is true if there's non-trivial alpha to premultiply with.
|
||||
if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
|
||||
WebPApplyAlphaMultiply(base_rgba, alpha_first,
|
||||
mb_w, num_rows, buf->stride);
|
||||
|
@ -27,20 +27,19 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
|
||||
int i, j;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB
|
||||
const __m128i all_0xff = _mm_set_epi32(~0u, ~0u, 0, 0);
|
||||
const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
|
||||
__m128i all_alphas = all_0xff;
|
||||
|
||||
// We must be able to access 3 extra bytes after the last written byte
|
||||
// 'dst[4 * width - 4]', because we don't know if alpha is the first or the
|
||||
// last byte of the quadruplet.
|
||||
const int limit = (width - 1) >> 3;
|
||||
const int limit = (width - 1) & ~7;
|
||||
|
||||
for (j = 0; j < height; ++j) {
|
||||
const uint8_t* in = alpha;
|
||||
__m128i* out = (__m128i*)dst;
|
||||
for (i = 0; i < limit; ++i) {
|
||||
for (i = 0; i < limit; i += 8) {
|
||||
// load 8 alpha bytes
|
||||
const __m128i a0 = _mm_loadl_epi64((__m128i*)in); // zeroes upper bytes
|
||||
const __m128i a0 = _mm_loadl_epi64((__m128i*)&alpha[i]);
|
||||
const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
|
||||
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
|
||||
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
|
||||
@ -59,7 +58,6 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
|
||||
// accumulate eight alpha 'and' in parallel
|
||||
all_alphas = _mm_and_si128(all_alphas, a0);
|
||||
out += 2;
|
||||
in += 8;
|
||||
}
|
||||
for (; i < width; ++i) {
|
||||
const uint32_t alpha_value = alpha[i];
|
||||
|
Loading…
x
Reference in New Issue
Block a user