add some colorspace conversion functions in NEON
new file: lossless_neon.c. Speedup is ~5%. gcc 4.6.3 seems to be doing some sub-optimal things here, storing registers on the stack using 'vstmia' and such. This looks similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=51509. I've tried adding -fno-split-wide-types and it does help the generated assembly, but the overall speed gets worse with this flag. We should only compile lossless_neon.c with it -> urk. Change-Id: I2ccc0929f5ef9dfb0105960e65c0b79b5f18d3b0
This commit is contained in:
parent
daccbf400d
commit
97e5fac389
@ -79,8 +79,9 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
|
|||||||
# instructions to be generated for armv7a code. Instead target the neon code
|
# instructions to be generated for armv7a code. Instead target the neon code
|
||||||
# specifically.
|
# specifically.
|
||||||
LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
|
LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
|
||||||
LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
|
|
||||||
LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
|
LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
|
||||||
|
LOCAL_SRC_FILES += src/dsp/lossless_neon.c.neon
|
||||||
|
LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
|
||||||
endif
|
endif
|
||||||
LOCAL_STATIC_LIBRARIES := cpufeatures
|
LOCAL_STATIC_LIBRARIES := cpufeatures
|
||||||
|
|
||||||
|
@ -174,6 +174,7 @@ DSP_DEC_OBJS = \
|
|||||||
$(DIROBJ)\dsp\dec_neon.obj \
|
$(DIROBJ)\dsp\dec_neon.obj \
|
||||||
$(DIROBJ)\dsp\dec_sse2.obj \
|
$(DIROBJ)\dsp\dec_sse2.obj \
|
||||||
$(DIROBJ)\dsp\lossless.obj \
|
$(DIROBJ)\dsp\lossless.obj \
|
||||||
|
$(DIROBJ)\dsp\lossless_neon.obj \
|
||||||
$(DIROBJ)\dsp\lossless_sse2.obj \
|
$(DIROBJ)\dsp\lossless_sse2.obj \
|
||||||
$(DIROBJ)\dsp\upsampling.obj \
|
$(DIROBJ)\dsp\upsampling.obj \
|
||||||
$(DIROBJ)\dsp\upsampling_mips32.obj \
|
$(DIROBJ)\dsp\upsampling_mips32.obj \
|
||||||
|
@ -69,6 +69,10 @@ EXTRA_FLAGS += -Wdeclaration-after-statement
|
|||||||
EXTRA_FLAGS += -Wshadow
|
EXTRA_FLAGS += -Wshadow
|
||||||
# EXTRA_FLAGS += -Wvla
|
# EXTRA_FLAGS += -Wvla
|
||||||
|
|
||||||
|
# NEON-specific flags:
|
||||||
|
# EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
|
||||||
|
# -> seems to make the overall lib slower: -fno-split-wide-types
|
||||||
|
|
||||||
#### Nothing should normally be changed below this line ####
|
#### Nothing should normally be changed below this line ####
|
||||||
|
|
||||||
AR = ar
|
AR = ar
|
||||||
@ -105,6 +109,7 @@ DSP_DEC_OBJS = \
|
|||||||
src/dsp/dec_neon.o \
|
src/dsp/dec_neon.o \
|
||||||
src/dsp/dec_sse2.o \
|
src/dsp/dec_sse2.o \
|
||||||
src/dsp/lossless.o \
|
src/dsp/lossless.o \
|
||||||
|
src/dsp/lossless_neon.o \
|
||||||
src/dsp/lossless_sse2.o \
|
src/dsp/lossless_sse2.o \
|
||||||
src/dsp/upsampling.o \
|
src/dsp/upsampling.o \
|
||||||
src/dsp/upsampling_mips32.o \
|
src/dsp/upsampling_mips32.o \
|
||||||
|
@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c
|
|||||||
COMMON_SOURCES += dec_sse2.c
|
COMMON_SOURCES += dec_sse2.c
|
||||||
COMMON_SOURCES += dsp.h
|
COMMON_SOURCES += dsp.h
|
||||||
COMMON_SOURCES += lossless.c
|
COMMON_SOURCES += lossless.c
|
||||||
|
COMMON_SOURCES += lossless_neon.c
|
||||||
COMMON_SOURCES += lossless_sse2.c
|
COMMON_SOURCES += lossless_sse2.c
|
||||||
COMMON_SOURCES += lossless.h
|
COMMON_SOURCES += lossless.h
|
||||||
COMMON_SOURCES += upsampling.c
|
COMMON_SOURCES += upsampling.c
|
||||||
|
@ -1475,6 +1475,7 @@ VP8LConvertFunc VP8LConvertBGRAToRGB565;
|
|||||||
VP8LConvertFunc VP8LConvertBGRAToBGR;
|
VP8LConvertFunc VP8LConvertBGRAToBGR;
|
||||||
|
|
||||||
extern void VP8LDspInitSSE2(void);
|
extern void VP8LDspInitSSE2(void);
|
||||||
|
extern void VP8LDspInitNEON(void);
|
||||||
|
|
||||||
void VP8LDspInit(void) {
|
void VP8LDspInit(void) {
|
||||||
memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
|
memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
|
||||||
@ -1494,6 +1495,11 @@ void VP8LDspInit(void) {
|
|||||||
if (VP8GetCPUInfo(kSSE2)) {
|
if (VP8GetCPUInfo(kSSE2)) {
|
||||||
VP8LDspInitSSE2();
|
VP8LDspInitSSE2();
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(WEBP_USE_NEON)
|
||||||
|
if (VP8GetCPUInfo(kNEON)) {
|
||||||
|
VP8LDspInitNEON();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
81
src/dsp/lossless_neon.c
Normal file
81
src/dsp/lossless_neon.c
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Use of this source code is governed by a BSD-style license
|
||||||
|
// that can be found in the COPYING file in the root of the source
|
||||||
|
// tree. An additional intellectual property rights grant can be found
|
||||||
|
// in the file PATENTS. All contributing project authors may
|
||||||
|
// be found in the AUTHORS file in the root of the source tree.
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// NEON variant of methods for lossless decoder
|
||||||
|
//
|
||||||
|
// Author: Skal (pascal.massimino@gmail.com)
|
||||||
|
|
||||||
|
#include "./dsp.h"
|
||||||
|
|
||||||
|
#if defined(WEBP_USE_NEON)
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#include "./lossless.h"
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// Colorspace conversion functions
|
||||||
|
|
||||||
|
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||||
|
int num_pixels, uint8_t* dst) {
|
||||||
|
const uint32_t* const end = src + num_pixels - 16;
|
||||||
|
for (; src <= end; src += 16) {
|
||||||
|
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||||
|
// swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
|
||||||
|
const uint8x16_t tmp = pixel.val[0];
|
||||||
|
pixel.val[0] = pixel.val[2];
|
||||||
|
pixel.val[2] = tmp;
|
||||||
|
vst4q_u8(dst, pixel);
|
||||||
|
dst += 64;
|
||||||
|
}
|
||||||
|
num_pixels &= 15;
|
||||||
|
VP8LConvertBGRAToRGBA_C(src, num_pixels, dst); // left-overs
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||||
|
int num_pixels, uint8_t* dst) {
|
||||||
|
const uint32_t* const end = src + num_pixels - 16;
|
||||||
|
for (; src <= end; src += 16) {
|
||||||
|
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||||
|
const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
|
||||||
|
vst3q_u8(dst, tmp);
|
||||||
|
dst += 48;
|
||||||
|
}
|
||||||
|
num_pixels &= 15;
|
||||||
|
VP8LConvertBGRAToBGR_C(src, num_pixels, dst); // left-overs
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||||
|
int num_pixels, uint8_t* dst) {
|
||||||
|
const uint32_t* const end = src + num_pixels - 16;
|
||||||
|
for (; src <= end; src += 16) {
|
||||||
|
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||||
|
const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
|
||||||
|
vst3q_u8(dst, tmp);
|
||||||
|
dst += 48;
|
||||||
|
}
|
||||||
|
num_pixels &= 15;
|
||||||
|
VP8LConvertBGRAToRGB_C(src, num_pixels, dst); // left-overs
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // WEBP_USE_NEON
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
extern void VP8LDspInitNEON(void);
|
||||||
|
|
||||||
|
// Points the VP8LConvertBGRATo{RGB,BGR,RGBA} function pointers at the NEON
// implementations from this file. When WEBP_USE_NEON is not defined the body
// is empty and the pointers are left untouched.
void VP8LDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // The three assignments are independent of one another.
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
#endif   // WEBP_USE_NEON
}
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
Loading…
x
Reference in New Issue
Block a user