Remove memcpy in lossless decoding.

Change-Id: Iba694b306486d67764e2fc5576c98a974c9b886c
This commit is contained in:
Vincent Rabaud 2016-11-24 17:45:22 +01:00
parent 7474d46e45
commit 71e2f5cadf
8 changed files with 151 additions and 112 deletions

View File

@ -712,13 +712,15 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
uint32_t* const rows_out = dec->argb_cache_;
// Inverse transforms.
// TODO: most transforms need to operate on the cropped region only.
memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
while (n-- > 0) {
VP8LTransform* const transform = &dec->transforms_[n];
VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out);
rows_in = rows_out;
}
if (rows_in != rows_out) {
// No transform called, hence just copy.
memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
}
}
// Processes (transforms, scales & color-converts) the rows decoded after the

View File

@ -234,15 +234,16 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
// Reads 'num_pixels' 32-bit pixels from 'src' and writes the results to 'dst'.
// Each pixel is loaded before it is stored, so 'src' and 'dst' may point to
// the same buffer (in-place operation).
void VP8LAddGreenToBlueAndRed_C(const uint32_t* const src, int num_pixels,
                                uint32_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t argb = src[i];
    const uint32_t green = ((argb >> 8) & 0xff);
    uint32_t red_blue = (argb & 0x00ff00ffu);
    // Add green to the two 8-bit lanes at bits 16..23 and 0..7 in one go; the
    // mask below drops the carry out of each lane (modulo-256 addition).
    red_blue += (green << 16) | green;
    red_blue &= 0x00ff00ffu;
    // Bits 24..31 and 8..15 (green) are passed through untouched.
    dst[i] = (argb & 0xff00ff00u) | red_blue;
  }
}
@ -258,11 +259,12 @@ static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
const uint32_t* const src, int num_pixels,
uint32_t* const dst) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = data[i];
const uint32_t argb = src[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
int new_red = red;
@ -272,13 +274,14 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
new_blue += ColorTransformDelta(m->green_to_blue_, green);
new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
new_blue &= 0xff;
data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
// Color space inverse transform.
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end, uint32_t* data) {
int y_start, int y_end,
const uint32_t* src, uint32_t* dst) {
const int width = transform->xsize_;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
@ -292,17 +295,19 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
while (y < y_end) {
const uint32_t* pred = pred_row;
VP8LMultipliers m = { 0, 0, 0 };
const uint32_t* const data_safe_end = data + safe_width;
const uint32_t* const data_end = data + width;
while (data < data_safe_end) {
const uint32_t* const src_safe_end = src + safe_width;
const uint32_t* const src_end = src + width;
while (src < src_safe_end) {
ColorCodeToMultipliers(*pred++, &m);
VP8LTransformColorInverse(&m, data, tile_width);
data += tile_width;
VP8LTransformColorInverse(&m, src, tile_width, dst);
src += tile_width;
dst += tile_width;
}
if (data < data_end) { // Left-overs using C-version.
if (src < src_end) { // Left-overs using C-version.
ColorCodeToMultipliers(*pred++, &m);
VP8LTransformColorInverse(&m, data, remaining_width);
data += remaining_width;
VP8LTransformColorInverse(&m, src, remaining_width, dst);
src += remaining_width;
dst += remaining_width;
}
++y;
if ((y & mask) == 0) pred_row += tiles_per_row;
@ -367,9 +372,13 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
assert(row_end <= transform->ysize_);
switch (transform->type_) {
case SUBTRACT_GREEN:
VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
break;
case PREDICTOR_TRANSFORM:
// TODO(vrabaud): parallelize transform predictors.
if (in != out) {
memcpy(out, in, (row_end - row_start) * width * sizeof(*out));
}
PredictorInverseTransform(transform, row_start, row_end, out);
if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row
@ -379,7 +388,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
}
break;
case CROSS_COLOR_TRANSFORM:
ColorSpaceInverseTransform(transform, row_start, row_end, out);
ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
break;
case COLOR_INDEXING_TRANSFORM:
if (in == out && transform->bits_ > 0) {
@ -556,10 +565,10 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
//------------------------------------------------------------------------------
VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
VP8LPredictorFunc VP8LPredictors[16];
VP8LTransformColorFunc VP8LTransformColorInverse;
VP8LTransformColorInverseFunc VP8LTransformColorInverse;
VP8LConvertFunc VP8LConvertBGRAToRGB;
VP8LConvertFunc VP8LConvertBGRAToRGBA;

View File

@ -35,8 +35,9 @@ extern "C" {
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
typedef void (*VP8LProcessDecBlueAndRedFunc)(const uint32_t* src,
int num_pixels, uint32_t* dst);
extern VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
typedef struct {
// Note: the members are uint8_t, so that any negative values are
@ -45,9 +46,10 @@ typedef struct {
uint8_t green_to_blue_;
uint8_t red_to_blue_;
} VP8LMultipliers;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColorInverse;
typedef void (*VP8LTransformColorInverseFunc)(const VP8LMultipliers* const m,
const uint32_t* src,
int num_pixels, uint32_t* dst);
extern VP8LTransformColorInverseFunc VP8LTransformColorInverse;
struct VP8LTransform; // Defined in dec/vp8li.h.
@ -93,7 +95,8 @@ void VP8LColorIndexInverseTransformAlpha(
// Expose some C-only fallback functions
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels);
const uint32_t* src, int num_pixels,
uint32_t* dst);
void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
@ -102,7 +105,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
uint32_t* dst);
// Must be called before calling any of the above methods.
void VP8LDspInit(void);
@ -110,7 +114,10 @@ void VP8LDspInit(void);
//------------------------------------------------------------------------------
// Encoding
extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* const dst, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride,

View File

@ -665,7 +665,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
//------------------------------------------------------------------------------
VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LTransformColorFunc VP8LTransformColor;

View File

@ -228,25 +228,27 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
uint32_t* dst) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = data + (num_pixels & ~3);
uint32_t* const p_loop2_end = data + num_pixels;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[data], %[p_loop1_end], 3f \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[data]) \n\t"
"lw %[temp1], 4(%[data]) \n\t"
"lw %[temp2], 8(%[data]) \n\t"
"lw %[temp3], 12(%[data]) \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t"
"addiu %[data], %[data], 16 \n\t"
"addiu %[src], %[src], 16 \n\t"
"addiu %[dst], %[dst], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t"
@ -255,44 +257,47 @@ static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
"addu.qb %[temp1], %[temp1], %[temp5] \n\t"
"addu.qb %[temp2], %[temp2], %[temp6] \n\t"
"addu.qb %[temp3], %[temp3], %[temp7] \n\t"
"sw %[temp0], -16(%[data]) \n\t"
"sw %[temp1], -12(%[data]) \n\t"
"sw %[temp2], -8(%[data]) \n\t"
"bne %[data], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[data]) \n\t"
"sw %[temp0], -16(%[dst]) \n\t"
"sw %[temp1], -12(%[dst]) \n\t"
"sw %[temp2], -8(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[dst]) \n\t"
"3: \n\t"
"beq %[data], %[p_loop2_end], 2f \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[data]) \n\t"
"addiu %[data], %[data], 4 \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"addiu %[src], %[src], 4 \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"addu.qb %[temp0], %[temp0], %[temp4] \n\t"
"bne %[data], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[data]) \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[dst]) \n\t"
"2: \n\t"
".set pop \n\t"
: [data]"+&r"(data), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[temp5]"=&r"(temp5), [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
: [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
const uint32_t* src, int num_pixels,
uint32_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_;
uint32_t* const p_loop_end = data + (num_pixels & ~1);
const uint32_t* const p_loop_end = src + (num_pixels & ~1);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[data], %[p_loop_end], 1f \n\t"
"beq %[src], %[p_loop_end], 1f \n\t"
" nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t"
@ -304,9 +309,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
"shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t"
"lw %[argb], 0(%[data]) \n\t"
"lw %[argb1], 4(%[data]) \n\t"
"addiu %[data], %[data], 8 \n\t"
"lw %[argb], 0(%[src]) \n\t"
"lw %[argb1], 4(%[src]) \n\t"
"sw %[argb], 0(%[dst]) \n\t"
"sw %[argb1], 4(%[dst]) \n\t"
"addiu %[src], %[src], 8 \n\t"
"addiu %[dst], %[dst], 8 \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t"
@ -323,29 +331,29 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
"shll.ph %[temp4], %[temp5], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t"
"sb %[temp5], -2(%[data]) \n\t"
"sb %[temp5], -2(%[dst]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t"
"addu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t"
"sb %[temp5], -6(%[data]) \n\t"
"sb %[temp3], -4(%[data]) \n\t"
"sb %[temp5], -6(%[dst]) \n\t"
"sb %[temp3], -4(%[dst]) \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"bne %[data], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[data]) \n\t"
"bne %[src], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[dst]) \n\t"
"1: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red]"=&r"(new_red), [argb]"=&r"(argb),
[argb1]"=&r"(argb1), [data]"+&r"(data)
[argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo"
);
// Fall-back to C-version for left-overs.
if (num_pixels & 1) VP8LTransformColorInverse_C(m, data, 1);
if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
}
static void ConvertBGRAToRGB(const uint32_t* src,

View File

@ -244,44 +244,51 @@ static void ConvertBGRAToRGB(const uint32_t* src,
}
}
// Adds the green byte of each pixel to its blue and red bytes (inverse of the
// 'subtract green' transform). Reads 'num_pixels' pixels from 'src' and writes
// the results to 'dst'.
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
                                 uint32_t* dst) {
  int i;
  const uint8_t* in = (const uint8_t*)src;
  uint8_t* out = (uint8_t*)dst;
  v16u8 src0, dst0, tmp0;
  // Shuffle control: indices 1, 5, 9 and 13 are the green bytes of the four
  // pixels held in one vector (the same byte the scalar tail reads as in[1]).
  // NOTE(review): 255 presumably selects a zero byte -- confirm against the
  // VSHF macros.
  const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
                       13, 255, 13, 255 };

  // Main loop: 8 pixels (two 16-byte vectors) per iteration.
  while (num_pixels >= 8) {
    v16u8 src1, dst1, tmp1;
    LD_UB2(in, 16, src0, src1);
    VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
    ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
    ST_UB2(dst0, dst1, out, 16);
    in += 32;
    out += 32;
    num_pixels -= 8;
  }
  if (num_pixels > 0) {
    // One more vector of 4 pixels, if available.
    if (num_pixels >= 4) {
      src0 = LD_UB(in);
      tmp0 = VSHF_UB(src0, src0, mask);
      dst0 = src0 + tmp0;
      ST_UB(dst0, out);
      in += 16;
      out += 16;
      num_pixels -= 4;
    }
    // Scalar tail for the last 0..3 pixels. Since 'src' and 'dst' can be
    // distinct buffers, all four bytes of each pixel must be written, and both
    // pointers must advance together.
    for (i = 0; i < num_pixels; i++) {
      const uint8_t b = in[0];
      const uint8_t g = in[1];
      const uint8_t r = in[2];
      out[0] = (b + g) & 0xff;
      out[1] = g;
      out[2] = (r + g) & 0xff;
      out[3] = in[3];  // fourth byte of the pixel is copied unchanged
      in += 4;
      out += 4;
    }
  }
}
static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
const uint32_t* src, int num_pixels,
uint32_t* dst) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
@ -293,34 +300,36 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
while (num_pixels >= 8) {
v16u8 src1, dst1;
LD_UB2(data, 4, src0, src1);
LD_UB2(src, 4, src0, src1);
TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
ST_UB2(dst0, dst1, data, 4);
data += 8;
ST_UB2(dst0, dst1, dst, 4);
src += 8;
dst += 8;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(data);
src0 = LD_UB(src);
TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
ST_UB(dst0, data);
data += 4;
ST_UB(dst0, dst);
src += 4;
dst += 4;
num_pixels -= 4;
}
if (num_pixels > 0) {
src0 = LD_UB(data);
src0 = LD_UB(src);
TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
if (num_pixels == 3) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
SD(pix_d, data + 0);
SW(pix_w, data + 2);
SD(pix_d, dst + 0);
SW(pix_w, dst + 2);
} else if (num_pixels == 2) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
SD(pix_d, data);
SD(pix_d, dst);
} else {
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
SW(pix_w, data);
SW(pix_w, dst);
}
}
}

View File

@ -171,28 +171,30 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
}
#endif // USE_VTBLQ
// Adds green to the blue and red channels (inverse of 'subtract green').
// Reads 'num_pixels' pixels from 'src' and writes the results to 'dst'.
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
                                 uint32_t* dst) {
  // End of the part of 'src' that can be processed 4 pixels at a time.
  const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  // 'src' and 'dst' advance in lockstep, so after the loop both point at the
  // first left-over pixel.
  for (; src < end; src += 4, dst += 4) {
    const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
const uint32_t* const src, int num_pixels,
uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_),
@ -219,7 +221,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
@ -240,10 +242,10 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
// 0 r' 0 b''
const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
vst1q_u32(argb_data + i, out);
vst1q_u32(dst + i, out);
}
// Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}
#undef USE_VTBLQ

View File

@ -157,26 +157,28 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
//------------------------------------------------------------------------------
// Subtract-Green Transform
// Adds green to the blue and red channels (inverse of 'subtract green').
// Reads 'num_pixels' pixels from 'src' and writes the results to 'dst'.
// Loads and stores are unaligned, 4 pixels (16 bytes) at a time.
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
                                 uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);                      // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 5.
const uint32_t* const src, int num_pixels,
uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
CST(green_to_red_), CST(green_to_blue_),
@ -190,7 +192,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
@ -202,10 +204,10 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0
const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''
const __m128i out = _mm_or_si128(J, A);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
_mm_storeu_si128((__m128i*)&dst[i], out);
}
// Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}
//------------------------------------------------------------------------------