add some colorspace conversion functions in NEON
new file: lossless_neon.c speedup is ~5% gcc 4.6.3 seems to be doing some sub-optimal things here, storing register on stack using 'vstmia' and such. Looks similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=51509 I've tried adding -fno-split-wide-types and it does help the generated assembly. But the overall speed gets worse with this flag. We should only compile lossless_neon.c with it -> urk. Change-Id: I2ccc0929f5ef9dfb0105960e65c0b79b5f18d3b0
This commit is contained in:
parent
daccbf400d
commit
97e5fac389
@ -79,8 +79,9 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
|
||||
# instructions to be generated for armv7a code. Instead target the neon code
|
||||
# specifically.
|
||||
LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
|
||||
LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
|
||||
LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
|
||||
LOCAL_SRC_FILES += src/dsp/lossless_neon.c.neon
|
||||
LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
|
||||
endif
|
||||
LOCAL_STATIC_LIBRARIES := cpufeatures
|
||||
|
||||
|
@ -174,6 +174,7 @@ DSP_DEC_OBJS = \
|
||||
$(DIROBJ)\dsp\dec_neon.obj \
|
||||
$(DIROBJ)\dsp\dec_sse2.obj \
|
||||
$(DIROBJ)\dsp\lossless.obj \
|
||||
$(DIROBJ)\dsp\lossless_neon.obj \
|
||||
$(DIROBJ)\dsp\lossless_sse2.obj \
|
||||
$(DIROBJ)\dsp\upsampling.obj \
|
||||
$(DIROBJ)\dsp\upsampling_mips32.obj \
|
||||
|
@ -69,6 +69,10 @@ EXTRA_FLAGS += -Wdeclaration-after-statement
|
||||
EXTRA_FLAGS += -Wshadow
|
||||
# EXTRA_FLAGS += -Wvla
|
||||
|
||||
# NEON-specific flags:
|
||||
# EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
|
||||
# -> seems to make the overall lib slower: -fno-split-wide-types
|
||||
|
||||
#### Nothing should normally be changed below this line ####
|
||||
|
||||
AR = ar
|
||||
@ -105,6 +109,7 @@ DSP_DEC_OBJS = \
|
||||
src/dsp/dec_neon.o \
|
||||
src/dsp/dec_sse2.o \
|
||||
src/dsp/lossless.o \
|
||||
src/dsp/lossless_neon.o \
|
||||
src/dsp/lossless_sse2.o \
|
||||
src/dsp/upsampling.o \
|
||||
src/dsp/upsampling_mips32.o \
|
||||
|
@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c
|
||||
COMMON_SOURCES += dec_sse2.c
|
||||
COMMON_SOURCES += dsp.h
|
||||
COMMON_SOURCES += lossless.c
|
||||
COMMON_SOURCES += lossless_neon.c
|
||||
COMMON_SOURCES += lossless_sse2.c
|
||||
COMMON_SOURCES += lossless.h
|
||||
COMMON_SOURCES += upsampling.c
|
||||
|
@ -1475,6 +1475,7 @@ VP8LConvertFunc VP8LConvertBGRAToRGB565;
|
||||
VP8LConvertFunc VP8LConvertBGRAToBGR;
|
||||
|
||||
extern void VP8LDspInitSSE2(void);
|
||||
extern void VP8LDspInitNEON(void);
|
||||
|
||||
void VP8LDspInit(void) {
|
||||
memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
|
||||
@ -1494,6 +1495,11 @@ void VP8LDspInit(void) {
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8LDspInitSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
VP8LDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
81
src/dsp/lossless_neon.c
Normal file
81
src/dsp/lossless_neon.c
Normal file
@ -0,0 +1,81 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// NEON variant of methods for lossless decoder
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./lossless.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Colorspace conversion functions
|
||||
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + num_pixels - 16;
|
||||
for (; src <= end; src += 16) {
|
||||
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
// swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
|
||||
const uint8x16_t tmp = pixel.val[0];
|
||||
pixel.val[0] = pixel.val[2];
|
||||
pixel.val[2] = tmp;
|
||||
vst4q_u8(dst, pixel);
|
||||
dst += 64;
|
||||
}
|
||||
num_pixels &= 15;
|
||||
VP8LConvertBGRAToRGBA_C(src, num_pixels, dst); // left-overs
|
||||
}
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + num_pixels - 16;
|
||||
for (; src <= end; src += 16) {
|
||||
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
|
||||
vst3q_u8(dst, tmp);
|
||||
dst += 48;
|
||||
}
|
||||
num_pixels &= 15;
|
||||
VP8LConvertBGRAToBGR_C(src, num_pixels, dst); // left-overs
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + num_pixels - 16;
|
||||
for (; src <= end; src += 16) {
|
||||
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
|
||||
vst3q_u8(dst, tmp);
|
||||
dst += 48;
|
||||
}
|
||||
num_pixels &= 15;
|
||||
VP8LConvertBGRAToRGB_C(src, num_pixels, dst); // left-overs
|
||||
}
|
||||
|
||||
#endif // WEBP_USE_NEON
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
extern void VP8LDspInitNEON(void);
|
||||
|
||||
void VP8LDspInitNEON(void) {
|
||||
#if defined(WEBP_USE_NEON)
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
|
||||
#endif // WEBP_USE_NEON
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
Loading…
x
Reference in New Issue
Block a user