Merge pull request #2472 from saamas/processing-x86-general-bilinear-downsample-optimizations

[Processing/x86] GeneralBilinearDownsample optimizations
This commit is contained in:
ruil2 2016-05-27 15:17:31 +08:00
commit 39c2fb3d6b
6 changed files with 2353 additions and 54 deletions

View File

@ -478,6 +478,12 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
%endif
%endmacro
%macro ZERO_EXTENSION 1
%ifndef X86_32
mov dword %1, %1
%endif
%endmacro
%macro WELS_EXTERN 1
ALIGN 16
%ifdef PREFIX

View File

@ -100,12 +100,18 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_ssse3;
}
if (iCpuFlag & WELS_CPU_SSE41) {
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
}
if (iCpuFlag & WELS_CPU_AVX2) {
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_avx2;
}
#endif//X86_ASM

View File

@ -101,6 +101,10 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
@ -114,6 +118,18 @@ void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStri
void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
const uint32_t kuiScaleY);
void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
WELSVP_EXTERN_C_END
#endif

View File

@ -247,58 +247,52 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr
}
}
#ifdef X86_ASM
void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
#if defined(X86_ASM) || defined(HAVE_NEON) || defined(HAVE_NEON_AARCH64)
static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
const int32_t kiScaleBitWidth, const int32_t kiScaleBitHeight,
void (*func) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight,
uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY)) {
const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
pSrc, kiSrcStride, uiScalex, uiScaley);
func (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
}
void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
const int32_t kiScaleBit = 15;
const uint32_t kuiScale = (1 << kiScaleBit);
#define DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP(suffix) \
void GeneralBilinearFastDownsamplerWrap_ ## suffix ( \
uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 16, 15, GeneralBilinearFastDownsampler_ ## suffix); \
}
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
#define DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP(suffix) \
void GeneralBilinearAccurateDownsamplerWrap_ ## suffix ( \
uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 15, 15, GeneralBilinearAccurateDownsampler_ ## suffix); \
}
#endif
GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
pSrc, kiSrcStride, uiScalex, uiScaley);
}
#ifdef X86_ASM
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
#endif //X86_ASM
#ifdef HAVE_NEON
void GeneralBilinearAccurateDownsamplerWrap_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
const int32_t kiScaleBit = 15;
const uint32_t kuiScale = (1 << kiScaleBit);
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
GeneralBilinearAccurateDownsampler_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex,
uiScaley);
}
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (neon)
#endif
#ifdef HAVE_NEON_AARCH64
void GeneralBilinearAccurateDownsamplerWrap_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
const int32_t kiDstWidth, const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
const int32_t kiScaleBit = 15;
const uint32_t kuiScale = (1 << kiScaleBit);
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
GeneralBilinearAccurateDownsampler_AArch64_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride,
uiScalex, uiScaley);
}
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (AArch64_neon)
#endif
WELSVP_NAMESPACE_END

File diff suppressed because it is too large Load Diff

View File

@ -296,21 +296,23 @@ TEST (DownSampleTest, func) { \
int src_stride_a; \
int src_width_a; \
int src_height_a; \
dst_stride_c = dst_stride_a = 320; \
src_stride_c = src_stride_a = 320; \
src_width_c = src_width_a = 320; \
src_height_c = src_height_a = 180; \
dst_width_c = dst_width_a = 300; \
dst_height_c = dst_height_a = 160; \
for (int j = 0; j < 70000; j++) { \
dst_c[j] = dst_a[j] = rand() % 256; \
src_c[j] = src_a[j] = rand() % 256; \
} \
ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
for (int j = 0; j < dst_height_c; j++) { \
for (int m = 0; m < dst_width_c ; m++) { \
ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
for (int i = 0; i < 5; i++) { \
dst_stride_c = dst_stride_a = 320; \
src_stride_c = src_stride_a = 320; \
src_width_c = src_width_a = 320; \
src_height_c = src_height_a = 180; \
dst_width_c = dst_width_a = (src_width_c >> (i + 1)) + rand() % (src_width_c >> (i + 1)); \
dst_height_c = dst_height_a = (src_height_c >> (i + 1)) + rand() % (src_height_c >> (i + 1)); \
for (int j = 0; j < 70000; j++) { \
dst_c[j] = dst_a[j] = rand() % 256; \
src_c[j] = src_a[j] = rand() % 256; \
} \
ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
for (int j = 0; j < dst_height_c; j++) { \
for (int m = 0; m < dst_width_c ; m++) { \
ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
} \
} \
} \
}
@ -343,6 +345,14 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2,
WELS_CPU_SSE2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_SSSE3)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_AVX2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2)
#endif
#if defined(HAVE_NEON)