Merge pull request #2472 from saamas/processing-x86-general-bilinear-downsample-optimizations
[Processing/x86] GeneralBilinearDownsample optimizations
This commit is contained in:
commit
39c2fb3d6b
@ -478,6 +478,12 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; ZERO_EXTENSION <operand>
;   Takes exactly one macro argument. When X86_32 is NOT defined, the macro
;   expands to a single `mov dword %1, %1`, i.e. the argument is moved onto
;   itself as a dword. When X86_32 is defined, the macro expands to nothing.
;   NOTE(review): presumably intended to be invoked with a 32-bit register so
;   that the self-move clears the upper half of the matching 64-bit register --
;   confirm against the call sites, which are not visible here.
%macro ZERO_EXTENSION 1
|
||||
%ifndef X86_32
|
||||
mov dword %1, %1
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro WELS_EXTERN 1
|
||||
ALIGN 16
|
||||
%ifdef PREFIX
|
||||
|
@ -100,12 +100,18 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_ssse3;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_SSE41) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
|
||||
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
|
||||
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_AVX2) {
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_avx2;
|
||||
}
|
||||
#endif//X86_ASM
|
||||
|
||||
|
@ -101,6 +101,10 @@ HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
|
||||
|
||||
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
|
||||
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
|
||||
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2;
|
||||
|
||||
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
|
||||
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
|
||||
@ -114,6 +118,18 @@ void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStri
|
||||
void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX,
|
||||
const uint32_t kuiScaleY);
|
||||
// Prototypes for the SSSE3, SSE4.1 and AVX2 general-ratio bilinear downsampling
// kernels. All four share one signature: destination buffer, destination
// stride / width / height, source buffer, source stride, then the horizontal
// and vertical scale values (uiScaleX, uiScaleY).
// NOTE(review): the definitions are not visible here; the scale values are
// presumably the fixed-point source/destination size ratios computed by the
// corresponding *Wrap_* functions -- confirm against the implementation.
void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
|
||||
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
|
||||
uint32_t uiScaleY);
|
||||
void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
|
||||
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
|
||||
uint32_t uiScaleY);
|
||||
void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
|
||||
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
|
||||
uint32_t uiScaleY);
|
||||
void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
|
||||
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
|
||||
uint32_t uiScaleY);
|
||||
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
@ -247,58 +247,52 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef X86_ASM
|
||||
void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
#if defined(X86_ASM) || defined(HAVE_NEON) || defined(HAVE_NEON_AARCH64)
|
||||
static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
|
||||
const int32_t kiScaleBitWidth, const int32_t kiScaleBitHeight,
|
||||
void (*func) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight,
|
||||
uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY)) {
|
||||
const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
|
||||
|
||||
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
|
||||
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
|
||||
|
||||
GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
|
||||
pSrc, kiSrcStride, uiScalex, uiScaley);
|
||||
func (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
|
||||
}
|
||||
|
||||
void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
const int32_t kiScaleBit = 15;
|
||||
const uint32_t kuiScale = (1 << kiScaleBit);
|
||||
// Expands to the definition of GeneralBilinearFastDownsamplerWrap_<suffix>.
// The generated function takes destination and source geometry (buffer, stride,
// width, height for each) and forwards everything to
// GeneralBilinearDownsamplerWrap, passing 16 as the horizontal scale bit width,
// 15 as the vertical scale bit width, and GeneralBilinearFastDownsampler_<suffix>
// as the kernel to invoke.
#define DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP(suffix) \
|
||||
void GeneralBilinearFastDownsamplerWrap_ ## suffix ( \
|
||||
uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
|
||||
GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
|
||||
pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 16, 15, GeneralBilinearFastDownsampler_ ## suffix); \
|
||||
}
|
||||
|
||||
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
|
||||
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
|
||||
// Expands to the definition of GeneralBilinearAccurateDownsamplerWrap_<suffix>.
// Same shape as the Fast variant, except that both the horizontal and the
// vertical scale bit widths passed to GeneralBilinearDownsamplerWrap are 15,
// and the kernel is GeneralBilinearAccurateDownsampler_<suffix>.
#define DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP(suffix) \
|
||||
void GeneralBilinearAccurateDownsamplerWrap_ ## suffix ( \
|
||||
uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, \
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
|
||||
GeneralBilinearDownsamplerWrap (pDst, kiDstStride, kiDstWidth, kiDstHeight, \
|
||||
pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, 15, 15, GeneralBilinearAccurateDownsampler_ ## suffix); \
|
||||
}
|
||||
#endif
|
||||
|
||||
GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
|
||||
pSrc, kiSrcStride, uiScalex, uiScaley);
|
||||
}
|
||||
// x86 wrapper instantiations (compiled only when X86_ASM is defined):
// Fast wrappers for sse2, ssse3 and avx2; Accurate wrappers for sse2, sse41
// and avx2. Each line defines one GeneralBilinear{Fast,Accurate}DownsamplerWrap_<suffix>.
#ifdef X86_ASM
|
||||
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
|
||||
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
|
||||
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
|
||||
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
|
||||
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
|
||||
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
|
||||
#endif //X86_ASM
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
void GeneralBilinearAccurateDownsamplerWrap_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
|
||||
const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
const int32_t kiScaleBit = 15;
|
||||
const uint32_t kuiScale = (1 << kiScaleBit);
|
||||
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
|
||||
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
|
||||
GeneralBilinearAccurateDownsampler_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex,
|
||||
uiScaley);
|
||||
}
|
||||
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (neon)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
void GeneralBilinearAccurateDownsamplerWrap_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
|
||||
const int32_t kiDstWidth, const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
const int32_t kiScaleBit = 15;
|
||||
const uint32_t kuiScale = (1 << kiScaleBit);
|
||||
uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
|
||||
uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
|
||||
GeneralBilinearAccurateDownsampler_AArch64_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride,
|
||||
uiScalex, uiScaley);
|
||||
}
|
||||
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (AArch64_neon)
|
||||
#endif
|
||||
WELSVP_NAMESPACE_END
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -296,21 +296,23 @@ TEST (DownSampleTest, func) { \
|
||||
int src_stride_a; \
|
||||
int src_width_a; \
|
||||
int src_height_a; \
|
||||
dst_stride_c = dst_stride_a = 320; \
|
||||
src_stride_c = src_stride_a = 320; \
|
||||
src_width_c = src_width_a = 320; \
|
||||
src_height_c = src_height_a = 180; \
|
||||
dst_width_c = dst_width_a = 300; \
|
||||
dst_height_c = dst_height_a = 160; \
|
||||
for (int j = 0; j < 70000; j++) { \
|
||||
dst_c[j] = dst_a[j] = rand() % 256; \
|
||||
src_c[j] = src_a[j] = rand() % 256; \
|
||||
} \
|
||||
ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
|
||||
func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
|
||||
for (int j = 0; j < dst_height_c; j++) { \
|
||||
for (int m = 0; m < dst_width_c ; m++) { \
|
||||
ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
|
||||
for (int i = 0; i < 5; i++) { \
|
||||
dst_stride_c = dst_stride_a = 320; \
|
||||
src_stride_c = src_stride_a = 320; \
|
||||
src_width_c = src_width_a = 320; \
|
||||
src_height_c = src_height_a = 180; \
|
||||
dst_width_c = dst_width_a = (src_width_c >> (i + 1)) + rand() % (src_width_c >> (i + 1)); \
|
||||
dst_height_c = dst_height_a = (src_height_c >> (i + 1)) + rand() % (src_height_c >> (i + 1)); \
|
||||
for (int j = 0; j < 70000; j++) { \
|
||||
dst_c[j] = dst_a[j] = rand() % 256; \
|
||||
src_c[j] = src_a[j] = rand() % 256; \
|
||||
} \
|
||||
ref (dst_c, dst_stride_c, dst_width_c, dst_height_c, src_c, src_stride_c, src_width_c, src_height_c); \
|
||||
func (dst_a, dst_stride_a, dst_width_a, dst_height_a, src_a, src_stride_a, src_width_a, src_height_a); \
|
||||
for (int j = 0; j < dst_height_c; j++) { \
|
||||
for (int m = 0; m < dst_width_c ; m++) { \
|
||||
ASSERT_EQ (dst_c[m + j * dst_stride_c], dst_a[m + j * dst_stride_a]); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
@ -343,6 +345,14 @@ GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_sse2,
|
||||
WELS_CPU_SSE2)
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse2,
|
||||
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE2)
|
||||
// Test cases for the SSSE3, SSE4.1 and AVX2 wrappers. Each invocation pairs a
// wrapper with the matching reference implementation (Fast wrappers with
// GeneralBilinearFastDownsampler_ref, Accurate wrappers with
// GeneralBilinearAccurateDownsampler_ref).
// NOTE(review): the macro's parameter list is not visible here; the third
// argument (1) and the trailing WELS_CPU_* flag are presumably an enable switch
// and the CPU feature required to run the case -- confirm against the macro.
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_ssse3, GeneralBilinearFastDownsampler_ref, 1,
|
||||
WELS_CPU_SSSE3)
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
|
||||
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,
|
||||
WELS_CPU_AVX2)
|
||||
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2,
|
||||
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
|
Loading…
x
Reference in New Issue
Block a user