lossless_neon: enable subtract green for aarch64
similar to: 1ba61b0 enable NEON intrinsics in aarch64 builds vtbl1_u8 is available everywhere but Xcode-based iOS arm64 builds, use vtbl1q_u8 there. performance varies based on the input, 1-3% on encode was observed Change-Id: Ifec35b37eb856acfcf69ed7f16fa078cd40b7034
This commit is contained in:
parent
72831f6b28
commit
416e1cea9b
@ -259,20 +259,44 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
|
||||
//------------------------------------------------------------------------------
|
||||
// Subtract-Green Transform
|
||||
|
||||
// vtbl? are unavailable in iOS/arm64 builds.
|
||||
#if !defined(__aarch64__)
|
||||
// vtbl?_u8 are marked unavailable for iOS arm64, use wider versions there.
|
||||
#if defined(__APPLE__) && defined(__aarch64__) && \
|
||||
defined(__apple_build_version__)
|
||||
#define USE_VTBLQ
|
||||
#endif
|
||||
|
||||
// 255 = byte will be zero'd
|
||||
#ifdef USE_VTBLQ
|
||||
// 255 = byte will be zeroed
|
||||
static const uint8_t kGreenShuffle[16] = {
|
||||
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
|
||||
};
|
||||
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
||||
const uint8x16_t shuffle) {
|
||||
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
|
||||
vtbl1q_u8(argb, vget_high_u8(shuffle)));
|
||||
}
|
||||
#else // !USE_VTBLQ
|
||||
// 255 = byte will be zeroed
|
||||
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
|
||||
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
||||
const uint8x8_t shuffle) {
|
||||
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
|
||||
vtbl1_u8(vget_high_u8(argb), shuffle));
|
||||
}
|
||||
#endif // USE_VTBLQ
|
||||
|
||||
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
||||
const uint32_t* const end = argb_data + (num_pixels & ~3);
|
||||
#ifdef USE_VTBLQ
|
||||
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
|
||||
#else
|
||||
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
|
||||
#endif
|
||||
for (; argb_data < end; argb_data += 4) {
|
||||
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
|
||||
const uint8x16_t greens =
|
||||
vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
|
||||
vtbl1_u8(vget_high_u8(argb), shuffle));
|
||||
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
|
||||
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
|
||||
}
|
||||
// fallthrough and finish off with plain-C
|
||||
@ -281,19 +305,21 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
||||
|
||||
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
||||
const uint32_t* const end = argb_data + (num_pixels & ~3);
|
||||
#ifdef USE_VTBLQ
|
||||
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
|
||||
#else
|
||||
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
|
||||
#endif
|
||||
for (; argb_data < end; argb_data += 4) {
|
||||
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
|
||||
const uint8x16_t greens =
|
||||
vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
|
||||
vtbl1_u8(vget_high_u8(argb), shuffle));
|
||||
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
|
||||
vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
|
||||
}
|
||||
// fallthrough and finish off with plain-C
|
||||
VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
|
||||
}
|
||||
|
||||
#endif // !__aarch64__
|
||||
#undef USE_VTBLQ
|
||||
|
||||
#endif // WEBP_USE_INTRINSICS
|
||||
|
||||
@ -320,11 +346,9 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
|
||||
VP8LPredictors[12] = Predictor12;
|
||||
VP8LPredictors[13] = Predictor13;
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // WEBP_USE_NEON
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user